Last active
March 30, 2022 20:26
-
-
Save BrikerMan/7bd4e4bd0a00ac9076986148afc06507 to your computer and use it in GitHub Desktop.
Convert a gensim word2vec model into a form that can be visualized in TensorBoard. Details: https://eliyar.biz/using-pre-trained-gensim-word2vector-in-a-keras-model-and-visualizing/
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import io

from gensim.models import KeyedVectors

# Load the gensim word vectors stored in word2vec format.
w2v_path = '<Gensim File Path>'
w2v = KeyedVectors.load_word2vec_format(w2v_path)

# gensim >= 4.0 exposes the vocabulary list as `index_to_key`; older
# releases call it `index2word`.  Support both so the script runs on either.
words = getattr(w2v, 'index_to_key', None)
if words is None:
    words = w2v.index2word

# Two files are produced, with rows in the same order:
#
# vecs.tsv - one vector per line, components separated by a tab:
#     0.1<TAB>0.2<TAB>0.5<TAB>0.9
#     0.2<TAB>0.1<TAB>5.0<TAB>0.2
#     0.4<TAB>0.1<TAB>7.0<TAB>0.8
#
# meta.tsv - one word per line:
#     token1
#     token2
#     token3
#
# `with` guarantees both files are flushed and closed even if a write fails.
with io.open('vecs.tsv', 'w', encoding='utf-8') as out_v, \
        io.open('meta.tsv', 'w', encoding='utf-8') as out_m:
    for word, vec in zip(words, w2v.vectors):
        out_m.write(word + "\n")
        out_v.write('\t'.join([str(x) for x in vec]) + "\n")

# The two files can then be visualized at http://projector.tensorflow.org/:
# 1. Open the Embedding Projector.
# 2. Click on "Load data".
# 3. Upload the two files created above: vecs.tsv and meta.tsv.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# encoding: utf-8 | |
""" | |
@author: BrikerMan | |
@contact: [email protected] | |
@blog: https://eliyar.biz | |
@version: 1.0 | |
@license: Apache Licence | |
@file: w2v_visualizer.py | |
@time: 2017/7/30 上午9:37 | |
""" | |
import sys | |
import os | |
import pathlib | |
import numpy as np | |
from gensim.models.keyedvectors import KeyedVectors | |
import tensorflow as tf | |
from tensorflow.contrib.tensorboard.plugins import projector | |
def visualize(model, output_path):
    """Export word vectors as a TensorBoard embedding-projector checkpoint.

    Writes into ``output_path``:

    * ``w2x_metadata.tsv`` - one word per line, in vocabulary order;
    * ``w2x_metadata.ckpt`` - a checkpoint holding the embedding matrix in a
      non-trainable variable named ``w2x_metadata``;
    * the graph summary and the projector configuration produced by
      ``projector.visualize_embeddings``.

    Relies on the TensorFlow 1.x API (sessions, ``tf.contrib`` projector).

    Args:
        model: A gensim ``KeyedVectors`` instance, or a full model that
            exposes its vectors through a ``wv`` attribute.
        output_path: Directory to write into; it must already exist.
    """
    meta_file = "w2x_metadata.tsv"

    # Accept both a full model (vectors under `.wv`) and a bare KeyedVectors,
    # which no longer has a `.wv` attribute in gensim >= 4.0.
    keyed_vectors = getattr(model, 'wv', model)
    # The vocabulary list is `index_to_key` in gensim >= 4.0 and
    # `index2word` in older releases.
    words = getattr(keyed_vectors, 'index_to_key', None)
    if words is None:
        words = keyed_vectors.index2word

    placeholder = np.zeros((len(words), keyed_vectors.vector_size))

    with open(os.path.join(output_path, meta_file), 'wb') as file_metadata:
        for i, word in enumerate(words):
            placeholder[i] = keyed_vectors[word]
            label = word
            # temporary solution for https://github.com/tensorflow/tensorflow/issues/9094
            if word == '':
                print("Emply Line, should replecaed by any thing else, or will cause a bug of tensorboard")
                label = '<Empty Line>'
            file_metadata.write("{0}".format(label).encode('utf-8') + b'\n')

    # Define the model without training.  The session is used as a context
    # manager so that it is released even if saving fails.
    with tf.Session() as sess:
        # The variable only needs to exist in the graph under this name;
        # the projector config below refers to it by `tensor_name`.
        tf.Variable(placeholder, trainable=False, name='w2x_metadata')
        sess.run(tf.global_variables_initializer())

        saver = tf.train.Saver()
        writer = tf.summary.FileWriter(output_path, sess.graph)
        try:
            # Register the embedding and its metadata with the projector.
            config = projector.ProjectorConfig()
            embed = config.embeddings.add()
            embed.tensor_name = 'w2x_metadata'
            embed.metadata_path = meta_file

            projector.visualize_embeddings(writer, config)
            saver.save(sess, os.path.join(output_path, 'w2x_metadata.ckpt'))
        finally:
            # Flush pending events and release the writer's file handle.
            writer.close()

    print('Run `tensorboard --logdir={0}` to run visualize result on tensorboard'.format(output_path))
if __name__ == "__main__":
    # Usage:
    #   1. Save the trained vectors with `model.save_word2vec_format(...)`.
    #   2. Run `python w2v_visualizer.py word2vec.text visualize_result`.
    #
    # Both arguments are required.  Exit right away when they are missing:
    # otherwise the script would continue and crash with a NameError on the
    # undefined paths.
    if len(sys.argv) < 3:
        print("Please provice model path and output path")
        sys.exit(1)

    model_path = sys.argv[1]
    output_path = sys.argv[2]

    model = KeyedVectors.load_word2vec_format(model_path)
    # Create the output directory (and any parents) if it does not exist.
    pathlib.Path(output_path).mkdir(parents=True, exist_ok=True)
    visualize(model, output_path)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Any help with this issue would be much appreciated.
I have saved my word2vec model as below
While running
python w2v_visualizer-v2.py
I am getting the following error. Note that my saves file has the following structure
I have also tried to use bin file but didn't work.
I am also getting the same error with
python -m gensim.scripts.word2vec2tensor -i saves -o saves-tf-projector
.