Created
November 18, 2016 12:59
-
-
Save AashishTiwari/bf163429ce62ace967c10bfab4c2122c to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Polyglot Demo.\n", | |
"\n", | |
"### Notebook by [Aashish K Tiwari]\n", | |
"#### [Persistent Systems Ltd]\n", | |
"#### Data Source: https://sites.google.com/site/rmyeid/projects/polyglot" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Table of contents\n", | |
"\n", | |
"\n", | |
"1. [Step 1: Loading Embeddings](#Step-1:-Loading-Embeddings)\n", | |
"\n", | |
"2. [Step 2: Analyzing](#Step-2:-Analyzing)\n", | |
"\n", | |
"3. [Step 3: Similarity](#Step-3:-Similarity)\n", | |
"\n", | |
"4. [Step 4: Demo](#Step-4:-Demo)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Step 1: Loading Embeddings\n", | |
"\n", | |
"[[ go back to the top ]](#Table-of-contents)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"This demo uses the Hindi embeddings (`polyglot-hi.pkl`) to explore Hindi words." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"import pickle\n", | |
"import numpy\n", | |
"words, embeddings = pickle.load(open('polyglot-hi.pkl', 'rb'))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Emebddings shape is (94004, 64)\n" | |
] | |
} | |
], | |
"source": [ | |
"print(\"Emebddings shape is {}\".format(embeddings.shape))" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Step 2: Analyzing\n", | |
"\n", | |
"[[ go back to the top ]](#Table-of-contents)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"(u'</S>',\n", | |
" u'<UNK>',\n", | |
" u'<PAD>',\n", | |
" u'<S>',\n", | |
" u'\\u0915\\u0947',\n", | |
" u',',\n", | |
" u'\\u0939\\u0948',\n", | |
" u'\\u092e\\u0947\\u0902',\n", | |
" u'\\u0964',\n", | |
" u'\\u0915\\u0940')" | |
] | |
}, | |
"execution_count": 3, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"words[:10]" | |
] | |
}, | |
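{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"The vocabulary starts with four special tokens (`</S>`, `<UNK>`, `<PAD>`, `<S>`), followed by very frequent words and punctuation that behave like stop words; the next cells print a few of them." | |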
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"के\n" | |
] | |
} | |
], | |
"source": [ | |
"print(u'\\u0915\\u0947')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"है\n" | |
] | |
} | |
], | |
"source": [ | |
"print(u'\\u0939\\u0948')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"में\n" | |
] | |
} | |
], | |
"source": [ | |
"print(u'\\u092e\\u0947\\u0902')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"u'\\u092a\\u094d\\u0930\\u0925\\u092e'" | |
] | |
}, | |
"execution_count": 7, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"'प्रथम'.decode('utf-8')" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Special tokens and their IDs" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"Token_ID = {\"</S>\": 0, \"<UNK>\": 1, \"<PAD>\":2, \"<S>\": 3}" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Step 3: Similarity\n", | |
"\n", | |
"[[ go back to the top ]](#Table-of-contents)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## K-Nearest Neighbors" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 9, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"\"\"\"KNN Example.\"\"\"\n", | |
"\n", | |
"from operator import itemgetter\n", | |
"from itertools import izip, islice\n", | |
"import re\n", | |
"import numpy\n", | |
"\n", | |
"def normalize(word, word_id):\n", | |
" \"\"\" Find the closest alternative in case the word is OOV.\"\"\"\n", | |
" if not word in word_id:\n", | |
" word = DIGITS.sub(\"#\", word)\n", | |
" if not word in word_id:\n", | |
" return None\n", | |
" return word\n", | |
"\n", | |
"\n", | |
"def l2_nearest(embeddings, word_index, k):\n", | |
" \"\"\"Sorts words according to their Euclidean distance.\n", | |
" To use cosine distance, embeddings has to be normalized so that their l2 norm is 1.\"\"\"\n", | |
"\n", | |
" e = embeddings[word_index]\n", | |
" distances = (((embeddings - e) ** 2).sum(axis=1) ** 0.5)\n", | |
" sorted_distances = sorted(enumerate(distances), key=itemgetter(1))\n", | |
" return zip(*sorted_distances[:k])\n", | |
"\n", | |
"\n", | |
"def knn(word, embeddings, word_id, id_word):\n", | |
" word = normalize(word, word_id)\n", | |
" if not word:\n", | |
" print(\"OOV word\")\n", | |
" return\n", | |
" word_index = word_id[word]\n", | |
" indices, distances = l2_nearest(embeddings, word_index, k)\n", | |
" neighbors = [id_word[idx] for idx in indices]\n", | |
" nearby_words = []\n", | |
" for i, (word, distance) in enumerate(izip(neighbors, distances)):\n", | |
" nearby_words.append(\"\\t\\t\".join([str(i), word, str(distance)]))\n", | |
" print i, '\\t', word, '\\t\\t', distance\n", | |
" return nearby_words\n" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Step 4: Demo\n", | |
"\n", | |
"[[ go back to the top ]](#Table-of-contents)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 10, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"CONTEXT_MAP = {\n", | |
" \"English\" : \"polyglot-en.pkl\",\n", | |
" \"Hindi\" : \"polyglot-hi.pkl\",\n", | |
" \"Marathi\" : \"polyglot-mr.pkl\",\n", | |
" \"Assamese\" : \"polyglot-as.pkl\",\n", | |
" \"Tamil\" : \"polyglot-ta.pkl\",\n", | |
" \"Telugu\" : \"polyglot-te.pkl\",\n", | |
" \"Malayalam\" : \"polyglot-ml.pkl\",\n", | |
" \"Bengali\" : \"polyglot-bn.pkl\"\n", | |
"}" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 11, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"# Number of neighbors to return.\n", | |
"k = 10" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 12, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"0 \tবাড়ি \t\t0.0\n", | |
"1 \tকার্যক্রম \t\t0.71795\n", | |
"2 \tকণ্ঠ \t\t0.835101\n", | |
"3 \tকবর \t\t0.836846\n", | |
"4 \tমেয়র \t\t0.844205\n", | |
"5 \tযাত্রা \t\t0.846311\n", | |
"6 \tখবর \t\t0.852814\n", | |
"7 \tহাদীস \t\t0.859106\n", | |
"8 \tমূল্য \t\t0.863202\n", | |
"9 \tঘরে \t\t0.863675\n" | |
] | |
} | |
], | |
"source": [ | |
"%matplotlib inline\n", | |
"from ipywidgets import widgets, interact\n", | |
"from IPython.display import display\n", | |
"\n", | |
"# Dropdown box:\n", | |
"columns = sorted(CONTEXT_MAP.keys())\n", | |
"selection = widgets.Dropdown(description=\"Please select a language\")\n", | |
"selection.options = columns\n", | |
"display(selection)\n", | |
"\n", | |
"# Input text box:\n", | |
"\n", | |
"input_text=widgets.Text(\n", | |
" value='Obama',\n", | |
" placeholder='Type something',\n", | |
" description='Input Word:',\n", | |
" disabled=False\n", | |
")\n", | |
"input_text.layout.align_items='flex-start'\n", | |
"display(input_text)\n", | |
"\n", | |
"# Output text box\n", | |
"\n", | |
"out_text=widgets.Text(\n", | |
" placeholder='results',\n", | |
" description='Output:',\n", | |
" disabled=False\n", | |
")\n", | |
"\n", | |
"#Button\n", | |
"\n", | |
"button = widgets.Button(\n", | |
" description='Search',\n", | |
" disabled=False,\n", | |
" button_style='danger',\n", | |
" tooltip='find',\n", | |
" icon='check'\n", | |
")\n", | |
"display(button)\n", | |
"\n", | |
"def get_embeddings_file_from_context(context):\n", | |
" return CONTEXT_MAP[context]\n", | |
"\n", | |
"def on_button_clicked(b):\n", | |
" embed_file = get_embeddings_file_from_context(selection.value)\n", | |
" words, embeddings = pickle.load(open(embed_file, 'rb'))\n", | |
" in_word = input_text.value\n", | |
" \n", | |
" ID_Token = {v:k for k,v in Token_ID.iteritems()}\n", | |
"\n", | |
" # Map words to indices and vice versa\n", | |
" word_id = {w:i for (i, w) in enumerate(words)}\n", | |
" id_word = dict(enumerate(words))\n", | |
"\n", | |
" # Noramlize digits by replacing them with #\n", | |
" DIGITS = re.compile(\"[0-9]\", re.UNICODE)\n", | |
"\n", | |
" nearby = knn(in_word,embeddings, word_id, id_word)\n", | |
" out_text = '\\n'.join(nearby)\n", | |
"# display(out_text)\n", | |
"# knn(in_word,embeddings, word_id, id_word)\n", | |
"\n", | |
"button.on_click(on_button_clicked)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"anaconda-cloud": {}, | |
"kernelspec": { | |
"display_name": "Python [default]", | |
"language": "python", | |
"name": "python2" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 2 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython2", | |
"version": "2.7.12" | |
}, | |
"widgets": { | |
"state": { | |
"1f12b998a3f7427c85eba50396be46e7": { | |
"views": [ | |
{ | |
"cell_index": 21 | |
} | |
] | |
}, | |
"a17a9c2a9031400a8d6b5d1dffcd1854": { | |
"views": [ | |
{ | |
"cell_index": 21 | |
} | |
] | |
}, | |
"ad1560ae0848463ba9089ba0d226fd0f": { | |
"views": [ | |
{ | |
"cell_index": 20 | |
} | |
] | |
}, | |
"ee6d6fca82da413f8839b4c4b54a840b": { | |
"views": [ | |
{ | |
"cell_index": 21 | |
} | |
] | |
} | |
}, | |
"version": "1.2.0" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 0 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment