Skip to content

Instantly share code, notes, and snippets.

@wenfahu
Created December 6, 2016 03:22
Show Gist options
  • Save wenfahu/0678293c80576ad13085d29485168237 to your computer and use it in GitHub Desktop.
Save wenfahu/0678293c80576ad13085d29485168237 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 45,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"from __future__ import division\n",
"from sklearn.feature_extraction.text import CountVectorizer\n",
"from IPython.display import display\n",
"import numpy as np\n",
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"vectorizer = CountVectorizer(min_df=1, binary=True)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"d1 = 'human machine interface computer'\n",
"d2 = 'user computer system survey'\n",
"d3 = 'Interface management system user'\n",
"d4 = 'human user machine interface'\n",
"q = 'human user machine interface'\n",
"corpus = [d1, d2, d3, d4, q]"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"[u'computer',\n",
" u'human',\n",
" u'interface',\n",
" u'machine',\n",
" u'management',\n",
" u'survey',\n",
" u'system',\n",
" u'user']"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"X = vectorizer.fit_transform(corpus)\n",
"vectorizer.get_feature_names()"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"array([[1, 1, 1, 1, 0, 0, 0, 0],\n",
" [1, 0, 0, 0, 0, 1, 1, 1],\n",
" [0, 0, 1, 0, 1, 0, 1, 1],\n",
" [0, 1, 1, 1, 0, 0, 0, 1],\n",
" [0, 1, 1, 1, 0, 0, 0, 1]])"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data = X.toarray()\n",
"data"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>computer</th>\n",
" <th>human</th>\n",
" <th>interface</th>\n",
" <th>machine</th>\n",
" <th>management</th>\n",
" <th>survey</th>\n",
" <th>system</th>\n",
" <th>user</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>d1</th>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>d2</th>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>d3</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>d4</th>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>q</th>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" computer human interface machine management survey system user\n",
"d1 1 1 1 1 0 0 0 0\n",
"d2 1 0 0 0 0 1 1 1\n",
"d3 0 0 1 0 1 0 1 1\n",
"d4 0 1 1 1 0 0 0 1\n",
"q 0 1 1 1 0 0 0 1"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"df = pd.DataFrame(data = data, columns=vectorizer.get_feature_names(), index=['d1', 'd2', 'd3', 'd4', 'q'])\n",
"display(df)"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"(array([ 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5]),\n",
" array([ 0.5 , 0.5 , 0.75, 0.5 , 0.25, 0.25, 0.5 , 0.75]))"
]
},
"execution_count": 32,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"p = np.ones(data.shape[1]) * 0.5\n",
"r = np.sum(data[:4], axis=0) / (np.ones(data.shape[1]) * 4)\n",
"p, r"
]
},
{
"cell_type": "code",
"execution_count": 40,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>computer</th>\n",
" <th>human</th>\n",
" <th>interface</th>\n",
" <th>machine</th>\n",
" <th>management</th>\n",
" <th>survey</th>\n",
" <th>system</th>\n",
" <th>user</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>d1</th>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>1.00</td>\n",
" <td>1.0</td>\n",
" <td>0.00</td>\n",
" <td>0.00</td>\n",
" <td>0.0</td>\n",
" <td>0.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>d2</th>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.00</td>\n",
" <td>0.0</td>\n",
" <td>0.00</td>\n",
" <td>1.00</td>\n",
" <td>1.0</td>\n",
" <td>1.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>d3</th>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.00</td>\n",
" <td>0.0</td>\n",
" <td>1.00</td>\n",
" <td>0.00</td>\n",
" <td>1.0</td>\n",
" <td>1.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>d4</th>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>1.00</td>\n",
" <td>1.0</td>\n",
" <td>0.00</td>\n",
" <td>0.00</td>\n",
" <td>0.0</td>\n",
" <td>1.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>q</th>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>1.00</td>\n",
" <td>1.0</td>\n",
" <td>0.00</td>\n",
" <td>0.00</td>\n",
" <td>0.0</td>\n",
" <td>1.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>p</th>\n",
" <td>0.5</td>\n",
" <td>0.5</td>\n",
" <td>0.50</td>\n",
" <td>0.5</td>\n",
" <td>0.50</td>\n",
" <td>0.50</td>\n",
" <td>0.5</td>\n",
" <td>0.50</td>\n",
" </tr>\n",
" <tr>\n",
" <th>r</th>\n",
" <td>0.5</td>\n",
" <td>0.5</td>\n",
" <td>0.75</td>\n",
" <td>0.5</td>\n",
" <td>0.25</td>\n",
" <td>0.25</td>\n",
" <td>0.5</td>\n",
" <td>0.75</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" computer human interface machine management survey system user\n",
"d1 1.0 1.0 1.00 1.0 0.00 0.00 0.0 0.00\n",
"d2 1.0 0.0 0.00 0.0 0.00 1.00 1.0 1.00\n",
"d3 0.0 0.0 1.00 0.0 1.00 0.00 1.0 1.00\n",
"d4 0.0 1.0 1.00 1.0 0.00 0.00 0.0 1.00\n",
"q 0.0 1.0 1.00 1.0 0.00 0.00 0.0 1.00\n",
"p 0.5 0.5 0.50 0.5 0.50 0.50 0.5 0.50\n",
"r 0.5 0.5 0.75 0.5 0.25 0.25 0.5 0.75"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"df = pd.DataFrame(data = np.vstack((data, p, r)), columns=vectorizer.get_feature_names(), index=['d1', 'd2', 'd3', 'd4', 'q', 'p', 'r'])\n",
"display(df)"
]
},
{
"cell_type": "code",
"execution_count": 116,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"def logOp(c):\n",
" a = c[0]\n",
" b = c[1]\n",
" return np.log10(a/(1-a) * (1-b)/b)\n",
"\n",
"def sim(idx):\n",
" indices = np.nonzero(np.bitwise_and(data[idx], data[-1]))\n",
" tmp =np.apply_along_axis(logOp, 0, np.vstack((p[indices], r[indices])))\n",
" # print np.vstack((p[indices], r[indices]))\n",
" return np.sum(tmp)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Estimates of P(xi | R)"
]
},
{
"cell_type": "code",
"execution_count": 117,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>similarity</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>d1</th>\n",
" <td>-0.477121</td>\n",
" </tr>\n",
" <tr>\n",
" <th>d2</th>\n",
" <td>-0.477121</td>\n",
" </tr>\n",
" <tr>\n",
" <th>d3</th>\n",
" <td>-0.954243</td>\n",
" </tr>\n",
" <tr>\n",
" <th>d4</th>\n",
" <td>-0.954243</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" similarity\n",
"d1 -0.477121\n",
"d2 -0.477121\n",
"d3 -0.954243\n",
"d4 -0.954243"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"df2 = pd.DataFrame(data = np.array([sim(0), sim(1), sim(2), sim(3)]), columns=['similarity'], index=['d1', 'd2', 'd3', 'd4'])\n",
"display(df2.sort_values('similarity', ascending=False))"
]
},
{
"cell_type": "code",
"execution_count": 118,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"ni = np.sum(data[:4], axis=0)\n",
"Vi = data[0] + data[1]"
]
},
{
"cell_type": "code",
"execution_count": 119,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def pi_comp(i):\n",
" return (Vi[i] + ni[i]/4) / 3\n",
"\n",
"def ri_comp(i):\n",
" return (ni[i] - Vi[i] + ni[i]/4) / 3"
]
},
{
"cell_type": "code",
"execution_count": 120,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"array([ 0.83333333, 0.5 , 0.58333333, 0.5 , 0.08333333,\n",
" 0.41666667, 0.5 , 0.58333333])"
]
},
"execution_count": 120,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pi = np.apply_along_axis(pi_comp, 0, np.arange(data.shape[1]))\n",
"pi"
]
},
{
"cell_type": "code",
"execution_count": 121,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"array([ 0.16666667, 0.5 , 0.91666667, 0.5 , 0.41666667,\n",
" 0.08333333, 0.5 , 0.91666667])"
]
},
"execution_count": 121,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"ri = np.array(map(ri_comp, np.arange(data.shape[1])))\n",
"ri"
]
},
{
"cell_type": "code",
"execution_count": 122,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>computer</th>\n",
" <th>human</th>\n",
" <th>interface</th>\n",
" <th>machine</th>\n",
" <th>management</th>\n",
" <th>survey</th>\n",
" <th>system</th>\n",
" <th>user</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>d1</th>\n",
" <td>1.000000</td>\n",
" <td>1.0</td>\n",
" <td>1.000000</td>\n",
" <td>1.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>d2</th>\n",
" <td>1.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>0.000000</td>\n",
" <td>1.000000</td>\n",
" <td>1.0</td>\n",
" <td>1.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>d3</th>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>1.000000</td>\n",
" <td>0.0</td>\n",
" <td>1.000000</td>\n",
" <td>0.000000</td>\n",
" <td>1.0</td>\n",
" <td>1.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>d4</th>\n",
" <td>0.000000</td>\n",
" <td>1.0</td>\n",
" <td>1.000000</td>\n",
" <td>1.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>1.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>q</th>\n",
" <td>0.000000</td>\n",
" <td>1.0</td>\n",
" <td>1.000000</td>\n",
" <td>1.0</td>\n",
" <td>0.000000</td>\n",
" <td>0.000000</td>\n",
" <td>0.0</td>\n",
" <td>1.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>ni</th>\n",
" <td>2.000000</td>\n",
" <td>2.0</td>\n",
" <td>3.000000</td>\n",
" <td>2.0</td>\n",
" <td>1.000000</td>\n",
" <td>1.000000</td>\n",
" <td>2.0</td>\n",
" <td>3.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Vi</th>\n",
" <td>2.000000</td>\n",
" <td>1.0</td>\n",
" <td>1.000000</td>\n",
" <td>1.0</td>\n",
" <td>0.000000</td>\n",
" <td>1.000000</td>\n",
" <td>1.0</td>\n",
" <td>1.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>pi</th>\n",
" <td>0.833333</td>\n",
" <td>0.5</td>\n",
" <td>0.583333</td>\n",
" <td>0.5</td>\n",
" <td>0.083333</td>\n",
" <td>0.416667</td>\n",
" <td>0.5</td>\n",
" <td>0.583333</td>\n",
" </tr>\n",
" <tr>\n",
" <th>ri</th>\n",
" <td>0.166667</td>\n",
" <td>0.5</td>\n",
" <td>0.916667</td>\n",
" <td>0.5</td>\n",
" <td>0.416667</td>\n",
" <td>0.083333</td>\n",
" <td>0.5</td>\n",
" <td>0.916667</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" computer human interface machine management survey system \\\n",
"d1 1.000000 1.0 1.000000 1.0 0.000000 0.000000 0.0 \n",
"d2 1.000000 0.0 0.000000 0.0 0.000000 1.000000 1.0 \n",
"d3 0.000000 0.0 1.000000 0.0 1.000000 0.000000 1.0 \n",
"d4 0.000000 1.0 1.000000 1.0 0.000000 0.000000 0.0 \n",
"q 0.000000 1.0 1.000000 1.0 0.000000 0.000000 0.0 \n",
"ni 2.000000 2.0 3.000000 2.0 1.000000 1.000000 2.0 \n",
"Vi 2.000000 1.0 1.000000 1.0 0.000000 1.000000 1.0 \n",
"pi 0.833333 0.5 0.583333 0.5 0.083333 0.416667 0.5 \n",
"ri 0.166667 0.5 0.916667 0.5 0.416667 0.083333 0.5 \n",
"\n",
" user \n",
"d1 0.000000 \n",
"d2 1.000000 \n",
"d3 1.000000 \n",
"d4 1.000000 \n",
"q 1.000000 \n",
"ni 3.000000 \n",
"Vi 1.000000 \n",
"pi 0.583333 \n",
"ri 0.916667 "
]
},
"execution_count": 122,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pd.DataFrame(data=np.vstack((data, ni, Vi, pi, ri)), columns=vectorizer.get_feature_names(), index=['d1', 'd2', 'd3', 'd4', 'q', 'ni', 'Vi', 'pi', 'ri'])"
]
},
{
"cell_type": "code",
"execution_count": 123,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def sim2(idx):\n",
" indices = np.nonzero(np.bitwise_and(data[idx], data[-1]))\n",
" tmp =np.apply_along_axis(logOp, 0, np.vstack((pi[indices], ri[indices])))\n",
" return np.sum(tmp)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Improving Estimates of P(xi | R)"
]
},
{
"cell_type": "code",
"execution_count": 124,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>similarity</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>d1</th>\n",
" <td>-0.477121</td>\n",
" </tr>\n",
" <tr>\n",
" <th>d2</th>\n",
" <td>-0.477121</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" similarity\n",
"d1 -0.477121\n",
"d2 -0.477121"
]
},
"execution_count": 124,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pd.DataFrame(data = np.array([sim(0), sim(1)]), columns=['similarity'], index=['d1', 'd2'])"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 2",
"language": "python",
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.5"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment