T-SNE function for word embeddings
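The gist references a few helpers and settings that it never defines (save_bin, save_json, print_progress, save_dir, plot_dir, big_plot_dir, num_similar, test_words). The block below is a minimal sketch of what they might look like so the functions can run end to end; the names come from the code, but the bodies and values are assumptions, not the author's originals.

import json
import os
import sys

import numpy as np

save_dir = "output"          # assumed output folder for arrays and json files
plot_dir = "plots"           # assumed folder for the per-word t-SNE plots
big_plot_dir = "big_plots"   # assumed folder for the whole-vocabulary plots
num_similar = 20             # assumed number of neighbours to retrieve
test_words = []              # words to test, later extended with frequent words

for d in (save_dir, plot_dir, big_plot_dir):
    os.makedirs(d, exist_ok=True)

def save_bin(array, filename):
    # assumed helper: persist a numpy array to disk
    np.save(filename, array)

def save_json(data, filename):
    # assumed helper: persist a python object as json
    with open(filename, "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=2)

def print_progress(count, total):
    # assumed helper: lightweight progress indicator
    sys.stdout.write("\r" + str(count) + "/" + str(total))
    sys.stdout.flush()
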
import io
import os
from collections import Counter

import matplotlib.pyplot as plt
import nltk
import numpy as np
from sklearn.manifold import TSNE


def get_word_frequencies(text):
    r""" This function returns the most common words
    in a given text, together with their frequencies

    Parameters
    ----------
    text: list of str, e.g. df['text'].tolist()

    Return
    ------
    freq: list of (word, count) tuples, most frequent first
    """
    frequencies = Counter()
    # tokenize every sentence and count each token
    tokens = [nltk.word_tokenize(sentence) for sentence in text]
    for token in tokens:
        for word in token:
            frequencies[word] += 1
    freq = frequencies.most_common()
    return freq

def most_similar(input_word, num_similar):
    r""" This function uses word2vec to find the num_similar
    words that are most similar to a given input_word

    Parameters
    ----------
    input_word: str, input word
    num_similar: int, how many similar words we want to get

    Return
    ------
    output: list, [input_word, list of similar words]
    """
    sim = word2vec.wv.most_similar(input_word, topn=num_similar)
    # keep only the words, dropping the similarity scores
    found = [w for w, _ in sim]
    output = [input_word, found]
    return output

def calculate_t_sne(word2vec):
    r""" Main function to compute the t-SNE representation
    of the computed word2vec embeddings

    Return
    ------
    x_coords, y_coords: 1D arrays, 2D t-SNE coordinates of each word
    labels: list, word for each row of the embedding array
    arr: 2D array, word vectors for the whole vocabulary
    """
    # gensim < 4.0 API; with gensim >= 4.0 use word2vec.wv.key_to_index.keys()
    vocab = word2vec.wv.vocab.keys()
    vocab_len = len(vocab)
    dim0 = word2vec.wv.vector_size
    arr = np.empty((0, dim0), dtype='f')
    labels = []
    vectors_file = os.path.join(save_dir, "vocab_vectors.npy")
    labels_file = os.path.join(save_dir, "labels.json")
    print("Creating an array of vectors for each word in the vocab")
    for count, word in enumerate(vocab):
        if count % 50 == 0:
            print_progress(count, vocab_len)
        w_vec = word2vec.wv[word]
        labels.append(word)
        arr = np.append(arr, np.array([w_vec]), axis=0)
    save_bin(arr, vectors_file)
    save_json(labels, labels_file)

    x_c_filename = os.path.join(save_dir, "x_coords.npy")
    y_c_filename = os.path.join(save_dir, "y_coords.npy")
    print("Computing T-SNE for array of length: " + str(len(arr)))
    tsne = TSNE(n_components=2, random_state=1, verbose=1)
    np.set_printoptions(suppress=True)
    Y = tsne.fit_transform(arr)
    x_coords = Y[:, 0]
    y_coords = Y[:, 1]
    print("Saving coords.")
    save_bin(x_coords, x_c_filename)
    save_bin(y_coords, y_c_filename)
    return x_coords, y_coords, labels, arr

def t_sne_scatterplot(word):
    r""" Function to plot the t-SNE result for a given word
    and its nearest neighbours in the embedding space

    Parameters
    ----------
    word: str, word we want the w2v/t-SNE scatter plot for,
          together with its neighbours
    """
    dim0 = word2vec.wv.vector_size
    arr = np.empty((0, dim0), dtype='f')
    w_labels = [word]
    # collect the most similar words around the input word
    nearby = word2vec.wv.similar_by_word(word, topn=num_similar)
    arr = np.append(arr, np.array([word2vec.wv[word]]), axis=0)
    for n in nearby:
        w_vec = word2vec.wv[n[0]]
        w_labels.append(n[0])
        arr = np.append(arr, np.array([w_vec]), axis=0)
    # recent scikit-learn requires perplexity < n_samples (here num_similar + 1)
    tsne = TSNE(n_components=2, random_state=1,
                perplexity=min(30.0, len(arr) - 1))
    np.set_printoptions(suppress=True)
    Y = tsne.fit_transform(arr)
    x_coords = Y[:, 0]
    y_coords = Y[:, 1]
    # plot the input word as a big blue dot and its neighbours in red
    plt.rc("font", size=16)
    plt.figure(figsize=(16, 12), dpi=80)
    plt.scatter(x_coords[0], y_coords[0], s=800, marker="o", color="blue")
    plt.scatter(x_coords[1:], y_coords[1:], s=200, marker="o", color="red")
    for label, x, y in zip(w_labels, x_coords, y_coords):
        plt.annotate(label.upper(), xy=(x, y), xytext=(0, 0),
                     textcoords='offset points')
    plt.xlim(x_coords.min() - 50, x_coords.max() + 50)
    plt.ylim(y_coords.min() - 50, y_coords.max() + 50)
    filename = os.path.join(plot_dir, word + "_tsne.png")
    plt.savefig(filename)
    plt.close()

def test_word2vec(test_words):
    r""" Function to check whether each test word exists in our vocabulary
    and, if so, return the word along with its most similar words, through
    the word embeddings

    Parameters
    ----------
    test_words: list, words to check

    Return
    ------
    output: list, each element is [input word, list of associated words]
    """
    # gensim < 4.0 API; with gensim >= 4.0 use word2vec.wv.key_to_index
    vocab = word2vec.wv.vocab.keys()
    output = []
    associations = {}
    for count, word in enumerate(test_words):
        if word in vocab:
            print("[" + str(count + 1) + "] Testing: " + word)
            if word not in associations:
                associations[word] = []
            similar = most_similar(word, num_similar)
            t_sne_scatterplot(word)
            output.append(similar)
            for s in similar[1]:
                if s not in associations[word]:
                    associations[word].append(s)
        else:
            print("Word " + word + " not in vocab")
    filename = os.path.join(save_dir, "word2vec_test.json")
    save_json(output, filename)
    filename = os.path.join(save_dir, "associations.json")
    save_json(associations, filename)
    # also export the associations as a Source,Target edge list in csv
    filename = os.path.join(save_dir, "associations.csv")
    with io.open(filename, "w", encoding="utf-8") as handle:
        handle.write(u"Source,Target\n")
        for w, sim in associations.items():
            for s in sim:
                handle.write(w + u"," + s + u"\n")
    return output

def show_cluster_locations(results, labels, x_coords, y_coords):
    r""" Function to plot, for each tested word, where the word and its
    neighbours sit on the full t-SNE map

    Parameters
    ----------
    results: list, each element is [word, list of its neighbours]
    labels: list, word for each row of the embedding array
    x_coords, y_coords: 1D arrays, t-SNE coordinates on the 2D plane
    """
    for item in results:
        name = item[0]
        print("Plotting graph for " + name)
        similar = item[1]
        in_set_x = []
        in_set_y = []
        out_set_x = []
        out_set_y = []
        name_x = 0
        name_y = 0
        # split the vocabulary into the word itself, its neighbours
        # and everything else
        for count, word in enumerate(labels):
            xc = x_coords[count]
            yc = y_coords[count]
            if word == name:
                name_x = xc
                name_y = yc
            elif word in similar:
                in_set_x.append(xc)
                in_set_y.append(yc)
            else:
                out_set_x.append(xc)
                out_set_y.append(yc)
        plt.figure(figsize=(16, 12), dpi=80)
        plt.scatter(name_x, name_y, s=400, marker="o", c="blue")
        plt.scatter(in_set_x, in_set_y, s=80, marker="o", c="red")
        plt.scatter(out_set_x, out_set_y, s=8, marker=".", c="black")
        filename = os.path.join(big_plot_dir, name + "_tsne.png")
        plt.savefig(filename)
        plt.close()

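# ---------------------------------------------------------------------------
# NOTE: the driver lines below assume a trained `word2vec` model and a
# dataframe `df` with a 'text' column, neither of which is defined in this
# gist. A minimal sketch of how they might be prepared; the file name,
# column name and Word2Vec parameters are assumptions, not the author's:
# ---------------------------------------------------------------------------
import pandas as pd
from gensim.models import Word2Vec

df = pd.read_csv("corpus.csv")  # hypothetical input file with a 'text' column
sentences = [nltk.word_tokenize(s) for s in df['text'].tolist()]
# 'size' is the gensim < 4.0 keyword; in gensim >= 4.0 it becomes 'vector_size'
word2vec = Word2Vec(sentences, size=100, window=5, min_count=5, workers=4)
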
x_coords, y_coords, labels, arr = calculate_t_sne(word2vec)
# and let's save the t-sne plots with the word clusters
frequencies = get_word_frequencies(df['text'].tolist())
# check the first 50 most frequent words and see if they're in the w2v
for item in frequencies[:50]:
    test_words.append(item[0])
results = test_word2vec(test_words)
# and once we have all the words + neighbours let's see how t-SNE has grouped them
show_cluster_locations(results, labels, x_coords, y_coords)