Skip to content

Instantly share code, notes, and snippets.

@AashishTiwari
Created November 18, 2016 12:59
Show Gist options
  • Save AashishTiwari/bf163429ce62ace967c10bfab4c2122c to your computer and use it in GitHub Desktop.
Save AashishTiwari/bf163429ce62ace967c10bfab4c2122c to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Polyglot Demo.\n",
"\n",
"### Notebook by [Aashish K Tiwari]\n",
"#### [Persistent Systems Ltd]\n",
"#### Data Source: https://sites.google.com/site/rmyeid/projects/polyglot"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Table of contents\n",
"\n",
"\n",
"1. [Step 1: Loading Embeddings](#Step-1:-Loading-Embeddings)\n",
"\n",
"2. [Step 2: Analyzing](#Step-2:-Analyzing)\n",
"\n",
"3. [Step 3: Similarity](#Step-3:-Similarity)\n",
"\n",
"4. [Step 4: Demo](#Step-4:-Demo)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Step 1: Loading Embeddings\n",
"\n",
"[[ go back to the top ]](#Table-of-contents)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Using the Hindi embeddings to demonstrate nearest-neighbour lookups for Hindi words."
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"import pickle\n",
"import numpy\n",
"# Load the Hindi Polyglot embeddings (see data source link above).\n",
"# Use a context manager so the file handle is closed even if\n",
"# unpickling fails (the original open() was never closed).\n",
"with open('polyglot-hi.pkl', 'rb') as f:\n",
"    words, embeddings = pickle.load(f)"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Embeddings shape is (94004, 64)\n"
]
}
],
"source": [
"# Rows = vocabulary size, columns = embedding dimensionality.\n",
"# (Fixed typo in the message: 'Emebddings' -> 'Embeddings'.)\n",
"print(\"Embeddings shape is {}\".format(embeddings.shape))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Step 2: Analyzing\n",
"\n",
"[[ go back to the top ]](#Table-of-contents)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"(u'</S>',\n",
" u'<UNK>',\n",
" u'<PAD>',\n",
" u'<S>',\n",
" u'\\u0915\\u0947',\n",
" u',',\n",
" u'\\u0939\\u0948',\n",
" u'\\u092e\\u0947\\u0902',\n",
" u'\\u0964',\n",
" u'\\u0915\\u0940')"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Peek at the first ten vocabulary entries; the four special tokens\n",
"# come first, followed by very common words and punctuation.\n",
"words[:10]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"We can see the special tokens and common stop words at the start of the vocabulary"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"के\n"
]
}
],
"source": [
"# Render the escaped codepoints u'\\u0915\\u0947' as readable Devanagari.\n",
"print(u'\\u0915\\u0947')"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"है\n"
]
}
],
"source": [
"# Render u'\\u0939\\u0948' as readable Devanagari.\n",
"print(u'\\u0939\\u0948')"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"में\n"
]
}
],
"source": [
"# Render u'\\u092e\\u0947\\u0902' as readable Devanagari.\n",
"print(u'\\u092e\\u0947\\u0902')"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"u'\\u092a\\u094d\\u0930\\u0925\\u092e'"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Python-2-only str.decode: turn a UTF-8 byte string into its\n",
"# unicode escape form (compare with the vocabulary entries above).\n",
"'प्रथम'.decode('utf-8')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Stop word tokens"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Indices of the four special tokens at the head of the vocabulary.\n",
"Token_ID = {\"</S>\": 0, \"<UNK>\": 1, \"<PAD>\": 2, \"<S>\": 3}"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Step 3: Similarity\n",
"\n",
"[[ go back to the top ]](#Table-of-contents)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## K-Nearest Neighbors"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"\"\"\"KNN over the embedding matrix using Euclidean distance.\"\"\"\n",
"\n",
"from operator import itemgetter\n",
"from itertools import izip\n",
"import re\n",
"import numpy\n",
"\n",
"# Digit pattern used to normalize numeric tokens ('2016' -> '####').\n",
"# Bug fix: defined here at module scope. normalize() below reads it,\n",
"# but previously it only existed as a local inside the button\n",
"# callback, so any OOV lookup raised NameError.\n",
"DIGITS = re.compile(\"[0-9]\", re.UNICODE)\n",
"\n",
"\n",
"def normalize(word, word_id):\n",
"    \"\"\"Return `word` if known, its digit-normalized form if that is\n",
"    known, or None when the word is out-of-vocabulary.\"\"\"\n",
"    if word not in word_id:\n",
"        word = DIGITS.sub(\"#\", word)\n",
"        if word not in word_id:\n",
"            return None\n",
"    return word\n",
"\n",
"\n",
"def l2_nearest(embeddings, word_index, k):\n",
"    \"\"\"Sort all words by Euclidean distance to `word_index` and return\n",
"    (indices, distances) of the `k` closest.  To rank by cosine\n",
"    distance instead, l2-normalize `embeddings` first.\"\"\"\n",
"    e = embeddings[word_index]\n",
"    distances = ((embeddings - e) ** 2).sum(axis=1) ** 0.5\n",
"    sorted_distances = sorted(enumerate(distances), key=itemgetter(1))\n",
"    return zip(*sorted_distances[:k])\n",
"\n",
"\n",
"def knn(word, embeddings, word_id, id_word):\n",
"    \"\"\"Print and return the k nearest neighbours of `word` as\n",
"    tab-joined strings (k is the module-level constant set in the\n",
"    demo section).  Returns None for out-of-vocabulary words.\"\"\"\n",
"    word = normalize(word, word_id)\n",
"    if not word:\n",
"        print(\"OOV word\")\n",
"        return\n",
"    word_index = word_id[word]\n",
"    indices, distances = l2_nearest(embeddings, word_index, k)\n",
"    neighbors = [id_word[idx] for idx in indices]\n",
"    nearby_words = []\n",
"    for i, (word, distance) in enumerate(izip(neighbors, distances)):\n",
"        nearby_words.append(\"\\t\\t\".join([str(i), word, str(distance)]))\n",
"        print i, '\\t', word, '\\t\\t', distance\n",
"    return nearby_words\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Step 4: Demo\n",
"\n",
"[[ go back to the top ]](#Table-of-contents)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Display language name -> pickled Polyglot embeddings file.\n",
"CONTEXT_MAP = {\n",
"    'English': 'polyglot-en.pkl',\n",
"    'Hindi': 'polyglot-hi.pkl',\n",
"    'Marathi': 'polyglot-mr.pkl',\n",
"    'Assamese': 'polyglot-as.pkl',\n",
"    'Tamil': 'polyglot-ta.pkl',\n",
"    'Telugu': 'polyglot-te.pkl',\n",
"    'Malayalam': 'polyglot-ml.pkl',\n",
"    'Bengali': 'polyglot-bn.pkl',\n",
"}"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# k: how many nearest neighbours knn() prints and returns.\n",
"k = 10"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0 \tবাড়ি \t\t0.0\n",
"1 \tকার্যক্রম \t\t0.71795\n",
"2 \tকণ্ঠ \t\t0.835101\n",
"3 \tকবর \t\t0.836846\n",
"4 \tমেয়র \t\t0.844205\n",
"5 \tযাত্রা \t\t0.846311\n",
"6 \tখবর \t\t0.852814\n",
"7 \tহাদীস \t\t0.859106\n",
"8 \tমূল্য \t\t0.863202\n",
"9 \tঘরে \t\t0.863675\n"
]
}
],
"source": [
"%matplotlib inline\n",
"from ipywidgets import widgets, interact\n",
"from IPython.display import display\n",
"\n",
"# Language selector dropdown:\n",
"columns = sorted(CONTEXT_MAP.keys())\n",
"selection = widgets.Dropdown(description=\"Please select a language\")\n",
"selection.options = columns\n",
"display(selection)\n",
"\n",
"# Query-word input box:\n",
"input_text = widgets.Text(\n",
"    value='Obama',\n",
"    placeholder='Type something',\n",
"    description='Input Word:',\n",
"    disabled=False\n",
")\n",
"input_text.layout.align_items = 'flex-start'\n",
"display(input_text)\n",
"\n",
"# Output box.  Bug fix: it was created but never display()ed, and the\n",
"# callback rebound the name to a plain string, so results were never\n",
"# shown in it.\n",
"out_text = widgets.Text(\n",
"    placeholder='results',\n",
"    description='Output:',\n",
"    disabled=False\n",
")\n",
"display(out_text)\n",
"\n",
"# Search button:\n",
"button = widgets.Button(\n",
"    description='Search',\n",
"    disabled=False,\n",
"    button_style='danger',\n",
"    tooltip='find',\n",
"    icon='check'\n",
")\n",
"display(button)\n",
"\n",
"def get_embeddings_file_from_context(context):\n",
"    \"\"\"Map a language display name to its pickled embeddings file.\"\"\"\n",
"    return CONTEXT_MAP[context]\n",
"\n",
"def on_button_clicked(b):\n",
"    \"\"\"Load embeddings for the selected language and show the nearest\n",
"    neighbours of the input word.\"\"\"\n",
"    global DIGITS\n",
"    embed_file = get_embeddings_file_from_context(selection.value)\n",
"    # Context manager so the pickle file is closed after loading.\n",
"    with open(embed_file, 'rb') as f:\n",
"        words, embeddings = pickle.load(f)\n",
"    in_word = input_text.value\n",
"\n",
"    # Map words to indices and vice versa.\n",
"    word_id = {w: i for (i, w) in enumerate(words)}\n",
"    id_word = dict(enumerate(words))\n",
"\n",
"    # Normalize digits by replacing them with '#'.  Assigned as a\n",
"    # global because normalize() in the KNN cell reads it at module\n",
"    # scope; as a plain local it raised NameError on OOV words.\n",
"    DIGITS = re.compile(\"[0-9]\", re.UNICODE)\n",
"\n",
"    nearby = knn(in_word, embeddings, word_id, id_word)\n",
"    if nearby:\n",
"        # widgets.Text is single-line, so join neighbours on one line.\n",
"        out_text.value = ' | '.join(nearby)\n",
"\n",
"button.on_click(on_button_clicked)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"anaconda-cloud": {},
"kernelspec": {
"display_name": "Python [default]",
"language": "python",
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.12"
},
"widgets": {
"state": {
"1f12b998a3f7427c85eba50396be46e7": {
"views": [
{
"cell_index": 21
}
]
},
"a17a9c2a9031400a8d6b5d1dffcd1854": {
"views": [
{
"cell_index": 21
}
]
},
"ad1560ae0848463ba9089ba0d226fd0f": {
"views": [
{
"cell_index": 20
}
]
},
"ee6d6fca82da413f8839b4c4b54a840b": {
"views": [
{
"cell_index": 21
}
]
}
},
"version": "1.2.0"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment