Last active
March 30, 2022 20:26
-
-
Save BrikerMan/7bd4e4bd0a00ac9076986148afc06507 to your computer and use it in GitHub Desktop.
Convert a gensim word2vec model into files that can be visualized in TensorBoard's Embedding Projector. Details: https://eliyar.biz/using-pre-trained-gensim-word2vector-in-a-keras-model-and-visualizing/
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import io

from gensim.models import KeyedVectors

# Load the gensim word2vec vectors (word2vec text format).
w2v_path = '<Gensim File Path>'
w2v = KeyedVectors.load_word2vec_format(w2v_path)

# gensim >= 4.0 renamed `index2word` to `index_to_key`; accessing the old
# name raises there, so prefer the new attribute and fall back to the old one.
words = getattr(w2v, 'index_to_key', None)
if words is None:
    words = w2v.index2word

# Two files are produced, with row i of each describing the same word.
#
# vecs.tsv -- one vector per line, components separated by tabs:
#     0.1<TAB>0.2<TAB>0.5<TAB>0.9
#     0.2<TAB>0.1<TAB>5.0<TAB>0.2
#     0.4<TAB>0.1<TAB>7.0<TAB>0.8
#
# meta.tsv -- one word per line:
#     token1
#     token2
#     token3
#
# The `with` statement guarantees both files are flushed and closed even if
# writing fails part-way through.
with io.open('vecs.tsv', 'w', encoding='utf-8') as out_v, \
        io.open('meta.tsv', 'w', encoding='utf-8') as out_m:
    for word, vec in zip(words, w2v.vectors):
        out_m.write(word + "\n")
        out_v.write('\t'.join([str(x) for x in vec]) + "\n")

# The two files can then be visualized at http://projector.tensorflow.org/:
# 1. Open the Embedding Projector.
# 2. Click on "Load data".
# 3. Upload the two files created above: vecs.tsv and meta.tsv.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# encoding: utf-8 | |
""" | |
@author: BrikerMan | |
@contact: [email protected] | |
@blog: https://eliyar.biz | |
@version: 1.0 | |
@license: Apache Licence | |
@file: w2v_visualizer.py | |
@time: 2017/7/30 上午9:37 | |
""" | |
import sys | |
import os | |
import pathlib | |
import numpy as np | |
from gensim.models.keyedvectors import KeyedVectors | |
import tensorflow as tf | |
from tensorflow.contrib.tensorboard.plugins import projector | |
def visualize(model, output_path):
    """Export word vectors so TensorBoard's embedding projector can show them.

    Writes three kinds of artifacts into ``output_path`` (which must already
    exist): a metadata file with one word per line, a TensorFlow checkpoint
    holding the embedding matrix, and the projector configuration.

    :param model: a gensim ``KeyedVectors`` instance, or a model exposing its
        vectors through a ``wv`` attribute.
    :param output_path: directory that receives the metadata file, the
        checkpoint and the projector config.
    """
    meta_file = "w2x_metadata.tsv"

    # Accept both a full model (vectors live under `.wv`) and bare
    # KeyedVectors, and both the gensim 3.x (`index2word`) and gensim 4.x
    # (`index_to_key`) names for the vocabulary list.
    keyed_vectors = getattr(model, 'wv', model)
    words = getattr(keyed_vectors, 'index_to_key', None)
    if words is None:
        words = keyed_vectors.index2word

    placeholder = np.zeros((len(words), keyed_vectors.vector_size))

    with open(os.path.join(output_path, meta_file), 'wb') as file_metadata:
        for i, word in enumerate(words):
            placeholder[i] = keyed_vectors[word]
            # temporary solution for https://github.com/tensorflow/tensorflow/issues/9094
            # An empty metadata line breaks tensorboard, so substitute a marker.
            if word == '':
                print("Emply Line, should replecaed by any thing else, or will cause a bug of tensorboard")
                word = '<Empty Line>'
            file_metadata.write("{0}".format(word).encode('utf-8') + b'\n')

    # Define the model without training. A private graph keeps the variable
    # name exactly 'w2x_metadata' (matching `embed.tensor_name` below) even if
    # this function is called more than once in the same process, and the
    # `with` block closes the session instead of leaking it.
    graph = tf.Graph()
    with graph.as_default(), tf.Session(graph=graph) as sess:
        tf.Variable(placeholder, trainable=False, name='w2x_metadata')
        sess.run(tf.global_variables_initializer())
        saver = tf.train.Saver()
        writer = tf.summary.FileWriter(output_path, sess.graph)
        try:
            # Register the embedding and its metadata with the projector.
            config = projector.ProjectorConfig()
            embed = config.embeddings.add()
            embed.tensor_name = 'w2x_metadata'
            embed.metadata_path = meta_file
            projector.visualize_embeddings(writer, config)
            saver.save(sess, os.path.join(output_path, 'w2x_metadata.ckpt'))
        finally:
            # Flush pending events and release the file handle.
            writer.close()

    print('Run `tensorboard --logdir={0}` to run visualize result on tensorboard'.format(output_path))
if __name__ == "__main__":
    # Usage:
    #   Save the model with `model.save_word2vec_format` (word2vec format),
    #   then run `python w2v_visualizer.py word2vec.text visualize_result`.
    try:
        model_path = sys.argv[1]
        output_path = sys.argv[2]
    except IndexError:
        # Missing arguments: report and stop, rather than falling through to
        # a NameError on the undefined paths below.
        print("Please provice model path and output path")
        sys.exit(1)

    model = KeyedVectors.load_word2vec_format(model_path)
    # Create the output directory (and any parents) if it does not exist yet.
    pathlib.Path(output_path).mkdir(parents=True, exist_ok=True)
    visualize(model, output_path)
May I know which TensorFlow version you use?
1.14.0, 1.15.0
@sherringzhang try the v2 version, no need to use tensorflow.
Thanks!
Any help with this issue would be much appreciated.
I have saved my word2vec model as below
import gensim
import tensorflow
cbow = gensim.models.Word2Vec.load("word2vec-cbow-trained.model")
cbow.wv.save_word2vec_format("saves", binary=False, write_header=False)
While running python w2v_visualizer-v2.py
I am getting the following error
Traceback (most recent call last):
File "w2v_visualizer-v2.py", line 5, in <module>
w2v = KeyedVectors.load_word2vec_format(w2v_path)
File "/opt/anaconda3/lib/python3.7/site-packages/gensim/models/keyedvectors.py", line 1632, in load_word2vec_format
limit=limit, datatype=datatype, no_header=no_header,
File "/opt/anaconda3/lib/python3.7/site-packages/gensim/models/keyedvectors.py", line 1903, in _load_word2vec_format
vocab_size, vector_size = [int(x) for x in header.split()] # throws for invalid file format
File "/opt/anaconda3/lib/python3.7/site-packages/gensim/models/keyedvectors.py", line 1903, in <listcomp>
vocab_size, vector_size = [int(x) for x in header.split()] # throws for invalid file format
ValueError: invalid literal for int() with base 10: 'the'
Note that my saves file has the following structure
the 0.17685317 -1.9532617 1.5539714 0.25966546 -0.2790604 -2.2597713 1.4972197 0.10902006 1.2890892 -3.6024532 -2.989032 0.34657598 0.7236928 -2.42964 -0.8789496 -1.7872599 -0.44978893 0.60331094 3.6287258 -3.4678683 0.44643888 -2.4881196 -0.4031334 1.1506665 0.85839635 -0.074375466 0.8728451 -0.7151794 0.1678528 -0.799362 0.6072906 -1.508202 0.25062883 -0.86469465 -1.2192433 -1.1580321 -1.9778714 -1.3886946 1.4056126 -0.8735327 0.35347122 -0.86999214 -0.45525596 -0.42418057 -1.059203 1.8854493 0.15196545 -0.6945364 1.1803597 0.37946966 1.4427571 -1.1902769 -1.9924257 -1.8601042 1.4197625 4.008973 -1.056887 -2.7531395 -3.6723409 0.7010601 1.7005075 1.0070549 -1.214978 -0.7066994 -2.7889273 -1.3718377 -4.142946 2.6858153 -0.43708274 -2.88627 -1.2278798 0.50436705 -1.9579133 1.8198298 1.4223633 -0.7244085 -0.6265867 -1.4257486 -2.4365761 1.2560792 0.038024887 -1.0622818 -0.61398774 1.3327323 0.2054318 -0.64971733 -3.3743927 1.5060719 3.2805905 0.7192133 -1.5339907 1.2716265 0.1824157 -1.0124737 0.050999433 1.4663799 -1.1016992 -1.2143117 -2.4511175 -0.10557431
of 3.0460706 1.3373663 1.196816 -0.19556047 -0.8811049 -3.2064245 2.7241864 -0.7885825 0.9971016 -1.9416411 -1.2840354 -0.34925482 -0.45565912 -1.5531577 1.5190945 1.3647329 -1.002623 1.2740299 -0.4051151 -1.9208825 2.4888372 -1.7555032 -3.0612128 1.8195832 2.4814606 1.7883114 -2.1521654 -1.0710764 -1.5177658 -0.028148778 1.1640902 -1.8277678 0.15703708 -1.4029969 -2.1030195 1.4702427 0.4488853 1.5654148 3.3290143 -0.7383135 -1.8343161 -0.68447036 -0.043044392 0.26769924 -0.40439782 -0.18810375 -1.1142554 -0.46278685 0.24317823 -1.2461005 -1.8464402 -1.2612877 -0.8662675 2.1412709 -0.13359068 3.4847245 0.057869047 -2.790783 -2.7549887 -0.6750809 2.1154828 -1.2982407 0.10796748 2.1213553 -1.436385 2.1015587 -1.6171037 2.3242743 0.7234685 -1.8758755 -1.4552547 2.0747006 -1.8497071 -1.2190679 2.581521 0.15189165 0.52188194 -1.6772207 -1.1163213 1.2301515 2.924736 -0.7979048 1.7843307 0.90919286 0.84954226 -1.6375353 -0.97743297 1.2929957 2.3574789 -0.80276364 -1.1635864 1.6928878 -2.0798647 -2.6221583 -1.0339357 0.3908198 0.84976274 -2.3748333 -1.5945665 0.39038292
and 1.1219313 -0.57287216 2.8233395 2.1360872 0.7092419 -1.4452188 0.056862712 -0.8650348 -1.2400309 -1.7934142 2.912096 -0.8492032 -1.3192874 -2.6795244 -0.9867547 -0.32589948 0.5089368 -3.0507712 0.05717876 0.9776794 0.26174447 0.3085299 -0.1887419 2.034681 2.9300737 -1.4973634 -0.23923938 -2.1425972 1.4168317 -1.8936833 0.20425037 -0.76487523 -0.36804277 1.9585862 -0.33642375 -0.39844537 1.2265253 2.8116388 -0.079485364 -0.82307726 0.7894203 0.79567325 -1.8632652 1.388837 -2.1622436 0.42561036 -0.2638307 0.7827761 -2.921315 -0.85548234 -0.37340742 -1.8249861 -0.23272651 -0.23085229 -0.9447633 2.861003 -0.6493189 -0.5078093 -4.5068545 -0.09463242 1.7309222 2.2277017 1.6091677 -2.0859606 -2.2346895 0.62350595 -0.33926424 3.4101918 -0.43544188 -0.42850614 -0.34875816 0.9752414 -2.7059336 -0.83867943 -1.4181806 0.039321333 -1.5695908 -2.3829415 -3.8533163 -0.15862149 1.3913755 -0.7142783 -1.4000473 0.6372149 0.07471319 -0.15135452 -2.1134727 1.186361 1.4265207 2.1556866 -3.1364825 0.6773747 0.19556485 -0.2499766 0.0012382956 0.73377067 0.24287009 -0.25204325 -2.0515716 -0.8586815
in 0.4679107 -0.7147363 0.6870513 2.3970296 0.23133335 -2.0088308 0.96192807 -0.70336884 2.7202427 -3.295829 -0.111902796 0.84575987 -2.091557 -1.4566486 -1.2371515 -0.46427134 -0.57429343 -0.8059568 -1.232483 0.8712659 1.7126014 -1.8753738 -0.4192433 1.6890231 1.4969786 -3.532354 -3.5461607 1.8652811 -1.9688314 -2.7284658 1.1998934 -2.7128458 -0.14551273 1.485711 -2.0984075 -0.11708992 0.3388792 0.92114073 0.4436177 1.2787954 -0.43780327 -1.2907416 -3.675929 0.19476354 1.2023387 0.21112837 -0.048462275 0.5956885 -2.4125252 0.5563255 0.67015076 -3.0510373 -2.4840655 -0.9940212 -0.19181994 2.089644 -0.67003036 0.13005632 -3.5958734 1.4169023 2.415284 0.6011981 1.6696676 0.94127685 0.12856282 -1.1206942 0.44963643 3.4322197 1.6712409 -2.8097441 -0.056743283 0.885703 0.22115222 1.0445929 1.0310951 -0.17957939 -0.47198862 0.6815261 1.1931702 1.0212876 1.3147619 0.5675771 0.23508233 0.7991027 -0.93111813 -1.7462871 -0.37885237 0.8413612 5.558045 -0.12530512 -0.45482203 0.9609325 -2.6102614 -2.2339709 -1.5555964 1.4071726 -0.7291663 -3.266579 -0.319394 -0.9037868
to -3.8821414 4.21567 -4.2841916 1.9992005 8.387515 -1.866365 -1.7139552 1.5717068 -3.7546477 -6.2972994 -4.429308 0.3561993 -2.228322 -6.24198 -4.754417 -2.489323 0.33810374 1.5772519 5.625847 0.18966335 0.02501753 1.1203367 2.4982362 4.1448264 -3.023561 -1.5638204 -3.850069 -2.7785816 -1.1669389 2.6075501 -1.1107177 0.020647109 0.6798677 1.9573961 2.033776 2.4727473 0.34439242 -3.4267285 -3.4618797 -1.0767714 1.4911493 0.32205847 -3.8282282 0.46621034 -1.1868904 -0.804156 -2.0668411 -1.1781965 1.4468315 -1.9320458 2.4679568 -0.64231515 -0.9775714 1.0633377 0.9612057 2.0127504 -3.4997683 -4.415362 -7.259522 -0.71372855 -2.7171516 2.8856084 0.9670163 3.0653586 -2.1910143 -0.79722583 -0.74901676 6.434015 0.43257618 -6.0027103 -2.4667065 6.653222 -2.670447 -2.8447676 1.4860531 3.540087 4.0157084 2.0935838 2.4390619 -1.7799611 -2.3248808 -5.28272 -3.1426811 -1.1940547 -5.5037665 -7.4333463 -0.8022476 4.4970007 2.454048 2.0067215 -3.1876593 3.0883808 -0.23921992 1.9851035 -0.62840044 -2.6010897 -0.73158085 2.1242673 -3.8213096 2.5947778
was -4.710589 -3.3772073 0.10955221 -1.7313877 -8.288787 0.05409461 -6.080295 2.1534183 1.5065368 4.335596 -3.5250914 -0.3277998 0.4277706 3.5720081 1.3879797 -6.002331 2.2638056 0.15390304 1.8505913 -5.4874144 5.445016 0.6296176 -1.4566065 1.9769683 -7.294601 1.8752729 -3.8635497 2.6733608 -5.9826574 -2.3535771 -1.3837878 -6.3405747 2.4270785 7.0047126 2.188791 -2.5511227 -1.1451485 -5.0530715 -3.4014251 -2.576176 2.474123 -2.334116 -2.4481807 -2.2739136 3.7661273 4.3733006 -0.45788652 3.7542331 4.0064507 5.5941634 4.0001907 2.9434066 -1.6101458 -0.421224 -1.0708168 1.3551916 -0.9656049 -2.3622172 -1.9253881 0.5219556 3.9598243 5.149903 0.018789774 -0.2614307 6.4478564 -1.781651 2.080866 -5.375572 0.37850425 -1.2678552 -1.1856774 -3.4277577 -0.9645903 4.3355446 1.5985847 -0.2640619 2.1373053 2.773614 5.2806582 2.6934717 3.48066 -0.84807503 1.4767468 5.426377 -0.36306188 0.4825673 -1.2568187 4.875955 0.048968043 -2.151128 7.0404983 -0.026978265 0.9373621 -1.8621575 -5.954047 -1.558388 -3.4180856 -1.6886758 0.6581081 -1.2771552
is -2.4761403 -4.745872 -0.52507603 -4.8229136 -0.52402735 -0.30308694 -5.4508224 1.2384114 0.031921666 9.79006 -8.656783 -2.1981444 2.819296 5.0073905 -0.03273342 -1.577312 0.39155224 4.332872 3.1937835 -5.093693 3.7426393 -2.9764054 -3.7875974 5.7265434 -2.3378108 3.3960438 -6.4622827 1.3165145 -4.0487742 -1.8943297 -3.8854945 -1.9383514 3.5735753 4.4026666 2.4589038 2.4062846 -1.062349 -4.792357 -0.50390446 -5.6058297 2.7092764 -4.3803024 5.283212 0.08074169 1.648946 3.0423262 -6.310893 2.5328448 4.377649 -0.3310825 7.1264014 -5.306405 0.005553321 3.1993332 0.034337055 4.294785 1.8776917 -2.10396 -4.286922 -5.786969 6.3240724 -1.5837121 3.9194334 -2.2896976 2.170305 -0.42724505 -4.6715894 -5.27865 4.3029084 -0.74100006 -2.161638 -4.2911134 0.24633707 6.129507 3.0226285 -2.8621705 -0.694486 2.2658727 1.273172 4.868444 3.1779702 3.3881705 -1.6068625 2.3613548 2.3039763 1.0724328 2.4482958 3.3090308 -3.7438765 0.93811786 7.2100186 2.3706846 -0.030542172 2.9104407 -2.1533716 -1.487372 4.3187037 -3.0258844 -2.2990308 -4.9696674
for 1.3992636 1.5832471 -0.8895734 5.8175335 0.86994433 -0.34137616 1.6965578 0.35610542 -1.8316814 -2.5547972 -1.3647654 -1.2454106 1.1643976 -2.7216268 -1.2922747 -0.55117863 -1.3940042 0.82272893 -0.774639 -0.32584584 -0.93710077 1.6914349 -3.212801 1.291236 2.0006056 -1.2454157 -6.143175 -0.50502014 0.35242376 -1.3001075 0.2543482 -4.1115746 1.4173106 0.10958401 -2.9831893 1.5833054 0.9224081 0.2791168 -0.7371618 1.0639977 1.3920257 -1.8556877 -2.1440988 -1.3849066 1.788669 0.9723513 -2.2321274 3.2100303 -0.39172065 -3.3361773 0.3151164 -2.193909 4.508344 -0.4285582 0.74758065 0.5673972 -2.54101 4.3181605 -6.2061315 3.6213741 1.5109493 -0.060880948 2.5642757 1.9259039 -1.1053112 -0.9859321 -2.713804 1.0688162 -2.3088155 -4.6221027 -0.4982548 -0.3131879 -0.33689672 0.72330076 0.97918767 -0.14906633 4.8335185 -0.5903869 0.2768676 -2.192907 1.3851609 -0.5208601 1.0543551 -0.8356508 -3.2071674 -1.743094 -1.2873502 -0.7846164 1.4788506 2.2974634 -1.6659901 -0.6453968 -3.0161443 -2.2829316 0.5852248 -0.68889207 2.1304712 0.79754585 -2.0810611 2.707873
as -3.2138474 -1.3737184 -0.4900845 2.3804321 0.20644596 -4.7914643 -0.7833128 -2.0857232 -1.3549345 3.6720624 -4.5230885 1.8241613 1.2056868 -3.6549199 0.062405944 0.73604953 -3.194924 -3.3431904 1.0623393 -2.5494397 -3.9093316 -0.44945338 -1.3500556 1.0840981 5.4734097 0.5995898 -2.6338716 -2.935244 0.021404946 -4.6726513 1.904343 -2.0336611 4.1603155 3.9126804 -0.3598954 5.5134354 0.16425803 -1.1597283 -1.3014913 -2.0786629 -1.7961179 -1.0467496 -2.4315042 4.693773 -3.9579918 2.5038486 -1.5581553 2.9967482 -1.3689312 3.024006 3.5176284 -7.4987574 -0.13272764 -0.69940466 2.7831237 -2.0079074 -1.3492281 4.199071 -5.296896 1.6323377 1.0093807 -1.986234 6.2950144 3.50566 0.5140228 -0.69193727 -4.1022754 1.310019 -0.8469194 -2.8981595 -1.8818659 1.3815243 1.3265257 3.5771966 -4.446949 0.944633 0.31574664 -4.452766 -2.1552684 2.4903722 0.7287854 -6.098524 1.0507023 1.452695 3.2640574 -0.8936203 -2.1345525 2.2261577 2.0795228 4.387568 2.726179 1.4903517 -0.58494467 -2.617 1.6112714 2.270486 -4.3453274 1.3762197 -0.31027853 1.0753359
. . .
I have also tried using a binary (.bin) file, but that didn't work either.
I am also getting the same error with python -m gensim.scripts.word2vec2tensor -i saves -o saves-tf-projector
.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
May I know which TensorFlow version you use?