Last active
July 5, 2016 00:14
-
-
Save moonmilk/691cb5c4d824f65d5e9b0eb77c5d0dca to your computer and use it in GitHub Desktop.
messing around with ofxMSAWord2Vec
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# messing with ofxMSAWord2Vec\n", | |
"from https://github.com/memo/ofxMSAWord2Vec\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"load_word_vectors_bin : /Users/ranjit/Downloads/GoogleNews-vectors-negative300_trimmed_53K_lowercase.bin ... \n", | |
"num_words: 53084/53084\n", | |
"num_dims: 300\n", | |
"done in 7.57633709908 seconds.\n", | |
"------------------------------------------------------------\n", | |
"normalize_word_vectors ... done in 0.50585103035 seconds.\n" | |
] | |
} | |
], | |
"source": [ | |
"# Import only the helpers this notebook actually calls (instead of `import *`),\n", | |
"# so it is obvious where each name comes from.\n", | |
"from word2vec_utils import (\n", | |
"    do_word_maths,\n", | |
"    find_closest_words,\n", | |
"    load_word_vectors_bin,\n", | |
"    normalize_word_vectors,\n", | |
")\n", | |
"\n", | |
"# Location of the binary word-vector file; edit this single line to match your machine.\n", | |
"WORD_VECTORS_PATH = '/Users/ranjit/Downloads/GoogleNews-vectors-negative300_trimmed_53K_lowercase.bin'\n", | |
"\n", | |
"# `vecs` holds the vectors as loaded; `vecs_n` is what normalize_word_vectors returns for them.\n", | |
"# Both are indexed by word further down in the notebook.\n", | |
"vecs = load_word_vectors_bin(WORD_VECTORS_PATH)\n", | |
"vecs_n = normalize_word_vectors(vecs)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"[('dogs', 0.86804897),\n", | |
" ('puppy', 0.81064284),\n", | |
" ('cat', 0.7609458),\n", | |
" ('beagle', 0.74186218),\n", | |
" ('pup', 0.74069107),\n", | |
" ('chihuahua', 0.71739173),\n", | |
" ('pet', 0.71647859),\n", | |
" ('canine', 0.69182897),\n", | |
" ('collie', 0.67144096),\n", | |
" ('kitten', 0.66598809)]" | |
] | |
}, | |
"execution_count": 2, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# Nearest neighbours of the word 'dog' among the normalised vectors.\n", | |
"find_closest_words(vecs_n, 'dog')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"[('papillon', 0.69635022),\n", | |
" ('chihuahua', 0.6762343),\n", | |
" ('dalmatian', 0.65920705),\n", | |
" ('pug', 0.64561403),\n", | |
" ('puppy', 0.64243448),\n", | |
" ('labrador', 0.63804096),\n", | |
" ('mastiff', 0.62263489),\n", | |
" ('poodle', 0.62242281),\n", | |
" ('beagle', 0.62123823),\n", | |
" ('alsatian', 0.6157546)]" | |
] | |
}, | |
"execution_count": 8, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# Same lookup for a rarer word, queried by its string.\n", | |
"find_closest_words(vecs_n, 'pomeranian')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 10, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"[('pomeranian', 1.0),\n", | |
" ('papillon', 0.69635022),\n", | |
" ('chihuahua', 0.6762343),\n", | |
" ('dalmatian', 0.65920705),\n", | |
" ('pug', 0.64561403),\n", | |
" ('puppy', 0.64243448),\n", | |
" ('labrador', 0.63804096),\n", | |
" ('mastiff', 0.62263489),\n", | |
" ('poodle', 0.62242281),\n", | |
" ('beagle', 0.62123823)]" | |
] | |
}, | |
"execution_count": 10, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# Query by vector instead of by word: the word itself then appears as its own\n", | |
"# closest match (score 1.0 in the saved output of this cell).\n", | |
"pom_n = vecs_n['pomeranian']\n", | |
"find_closest_words(vecs_n, pom_n)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 14, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"# Two hand-picked ten-word vocabularies: one 'metal' in flavour, one deliberately not.\n", | |
"metal_words = [\n", | |
"    'burn', 'cries', 'veins', 'eternity', 'breathe',\n", | |
"    'beast', 'gonna', 'demons', 'ashes', 'soul',\n", | |
"]\n", | |
"unmetal_words = [\n", | |
"    'particularly', 'indicated', 'secretary', 'committee', 'university',\n", | |
"    'relatively', 'noted', 'approximately', 'chairman', 'employees',\n", | |
"]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 17, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"# Look up the vector of every listed word in both `vecs` and `vecs_n`,\n", | |
"# one statement per resulting list.\n", | |
"metal_vecs = [vecs[word] for word in metal_words]\n", | |
"metal_vecs_n = [vecs_n[word] for word in metal_words]\n", | |
"unmetal_vecs = [vecs[word] for word in unmetal_words]\n", | |
"unmetal_vecs_n = [vecs_n[word] for word in unmetal_words]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 27, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"([('hell', 0.58362615),\n", | |
" ('eateth', 0.54645419),\n", | |
" ('souls', 0.54323936),\n", | |
" ('looketh', 0.53242099),\n", | |
" ('god', 0.52718186)],\n", | |
" [('said', 0.53903073),\n", | |
" ('acknowledged', 0.49876258),\n", | |
" ('stressed', 0.48518729),\n", | |
" ('emphasized', 0.46537137),\n", | |
" ('committees', 0.46324104)])" | |
] | |
}, | |
"execution_count": 27, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# Find the words closest to each list's average word. Every word gets an equal\n", | |
"# weight of 1/len(words) instead of a hard-coded 0.1, so the lists may change size.\n", | |
"def equal_weight_terms(words):\n", | |
"    \"\"\"Return [(weight, word), ...] giving each word the weight 1/len(words).\n", | |
"\n", | |
"    An empty `words` yields an empty list (the division is never evaluated).\n", | |
"    \"\"\"\n", | |
"    return [(1.0 / len(words), word) for word in words]\n", | |
"\n", | |
"do_word_maths(vecs, vecs_n, equal_weight_terms(metal_words)), do_word_maths(vecs, vecs_n, equal_weight_terms(unmetal_words))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 41, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"normalize_word_vectors ... done in 5.19752502441e-05 seconds.\n" | |
] | |
} | |
], | |
"source": [ | |
"# Average the metal word vectors element-wise, then run the result through\n", | |
"# normalize_word_vectors by wrapping it in a one-entry dict under a throwaway key.\n", | |
"import numpy as np\n", | |
"\n", | |
"metal_mean = np.asarray(metal_vecs).mean(axis=0)\n", | |
"metal_mean_n = normalize_word_vectors({'metal_af': metal_mean})['metal_af']\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 43, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"[('soul', 0.63606858),\n", | |
" ('demons', 0.60098231),\n", | |
" ('hell', 0.58362621),\n", | |
" ('eateth', 0.54645419),\n", | |
" ('souls', 0.54323936),\n", | |
" ('beast', 0.53437954),\n", | |
" ('looketh', 0.53242099),\n", | |
" ('eternity', 0.52865142),\n", | |
" ('breathe', 0.52801883),\n", | |
" ('god', 0.52718186)]" | |
] | |
}, | |
"execution_count": 43, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# Words nearest to the normalised metal average; queried by vector, so words\n", | |
"# from the input list can appear in the result.\n", | |
"find_closest_words(vecs_n, metal_mean_n)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 44, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"normalize_word_vectors ... done in 3.79085540771e-05 seconds.\n" | |
] | |
} | |
], | |
"source": [ | |
"# Average the unmetal word vectors element-wise, then run the result through\n", | |
"# normalize_word_vectors by wrapping it in a one-entry dict under a throwaway key.\n", | |
"unmetal_mean = np.asarray(unmetal_vecs).mean(axis=0)\n", | |
"unmetal_mean_n = normalize_word_vectors({'unmetal_af': unmetal_mean})['unmetal_af']" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 45, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"[('chairman', 0.5799759),\n", | |
" ('noted', 0.55365336),\n", | |
" ('committee', 0.54178369),\n", | |
" ('said', 0.53903079),\n", | |
" ('secretary', 0.52612215),\n", | |
" ('indicated', 0.50970483),\n", | |
" ('acknowledged', 0.49876261),\n", | |
" ('stressed', 0.48518729),\n", | |
" ('emphasized', 0.46537143),\n", | |
" ('committees', 0.46324104)]" | |
] | |
}, | |
"execution_count": 45, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# Words nearest to the normalised unmetal average; queried by vector, so words\n", | |
"# from the input list can appear in the result.\n", | |
"find_closest_words(vecs_n, unmetal_mean_n)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 198, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
" --- less metal --- \n", | |
"concerned\treceptive\tinvolved\tsaid\tstressed\n", | |
"concerned\treceptive\tinvolved\tsaid\tamenable\n", | |
"concerned\treceptive\tinvolved\tamenable\tconsidering\tactively\n", | |
"concerned\treceptive\tinvolved\tamenable\taverse\tactively\tconsider\n", | |
"receptive\tconcerned\tinvolved\taverse\tamenable\tintrigued\tactively\n", | |
"receptive\tinvolved\tconcerned\taverse\tamenable\tintrigued\tconsider\n", | |
"involved\treceptive\tconcerned\taverse\tintrigued\tamenable\tuninterested\n", | |
"involved\treceptive\taverse\tconcerned\tintrigued\tamenable\tuninterested\n", | |
"involved\treceptive\taverse\tintrigued\tconcerned\tamenable\tuninterested\n", | |
"involved\taverse\tintrigued\treceptive\tuninterested\tamenable\tconcerned\n", | |
"interested\tinterested\tinterested\tinterested\tinterested\tinterested\tinterested\tinterested\tinterested\tinterested\n", | |
"averse\tinvolved\tintrigued\treceptive\tuninterested\tenamored\tamenable\n", | |
"intrigued\taverse\tinvolved\tenamored\tuninterested\treceptive\tdesirous\n", | |
"intrigued\taverse\tenamored\tdesirous\tuninterested\tinvolved\treceptive\n", | |
"intrigued\taverse\tenamored\tdesirous\tuninterested\tfond\tinvolved\n", | |
"intrigued\tenamored\taverse\tdesirous\tfond\tlove\tdreaming\n", | |
"love\tintrigued\tenamored\tdreaming\tloves\tdesirous\taverse\n", | |
"love\tmad\tdreaming\tloves\tintrigued\thell\n", | |
"love\thell\tmad\tloves\tdreaming\n", | |
"hell\tlove\tmad\tloves\n", | |
"hell\tlove\tmad\twarn't\n", | |
" --- more metal ---\n" | |
] | |
} | |
], | |
"source": [ | |
"def wordtable(w):\n", | |
"    \"\"\"Join the first element of every item in `w` into one tab-separated row.\"\"\"\n", | |
"    return \"\\t\".join(a[0] for a in w)\n", | |
"\n", | |
"\n", | |
"def nudged_neighbours(base_word, flavour_words, weight, top_k=8):\n", | |
"    \"\"\"Run do_word_maths on `base_word` (weight 1) plus every word in `flavour_words` at `weight`.\"\"\"\n", | |
"    terms = [(1, base_word)] + [(weight, flavour) for flavour in flavour_words]\n", | |
"    return do_word_maths(vecs, vecs_n, terms, top_k=top_k)\n", | |
"\n", | |
"\n", | |
"word = 'interested'\n", | |
"inc = 0.03   # weight added to every flavour word per step\n", | |
"steps = 10   # number of rows printed on each side of the bare word\n", | |
"\n", | |
"# print(...) with a single argument behaves the same on Python 2.7 and also runs on Python 3.\n", | |
"print(\" --- less metal --- \")\n", | |
"# Strongest unmetal push first (weight inc*steps), easing off towards the bare word.\n", | |
"for f in range(steps, 0, -1):\n", | |
"    print(wordtable(nudged_neighbours(word, unmetal_words, inc * f)))\n", | |
"# Divider row: the unmodified word repeated across the row.\n", | |
"print(wordtable([(word,)] * steps))\n", | |
"# NOTE: this side starts at f=0 (no push at all), so it tops out at inc*(steps-1),\n", | |
"# one step short of the unmetal side. Kept as-is so the saved output stays valid.\n", | |
"for f in range(0, steps):\n", | |
"    print(wordtable(nudged_neighbours(word, metal_words, inc * f)))\n", | |
"print(\" --- more metal ---\")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 46, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 2", | |
"language": "python", | |
"name": "python2" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 2 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython2", | |
"version": "2.7.11" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 0 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment