Kautenja · September 25, 2017 23:41
diff --git a/Distance Functions.ipynb b/Distance Functions.ipynb
 {
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Math courtesy of https://numerics.mathdotnet.com/distance.html"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 94,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "from pandas import DataFrame\n",
    "from numpy.random import randint, seed"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 95,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# the printable 95 characters in the ASCII table\n",
    "PRINTABLE = list(' !\"#$%&\\'()*+,-./:;<=>?@[]\\\\^_`~}{|0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 96,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def random_vector(random_state: int, columns: list=PRINTABLE, low: int=0, high: int=10000) -> pd.DataFrame:\n",
    "    \"\"\"\n",
    "    Return a random vector.\n",
    "    \n",
    "    Args:\n",
    "        random_state: the random state to use\n",
    "        columns: the name of the random columns\n",
    "        low: the low value for the random range\n",
    "        high: the high value for the random range\n",
    "        \n",
    "    Returns: a random dataframe with the columns\n",
    "    \"\"\"\n",
    "    if random_state is not None:\n",
    "        seed(random_state)\n",
    "    values = randint(low=low, high=high, size=len(columns)).reshape(1, len(columns))\n",
    "    return pd.DataFrame(values, columns=columns)        "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 97,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th>!</th>\n",
       "      <th>\"</th>\n",
       "      <th>#</th>\n",
       "      <th>$</th>\n",
       "      <th>%</th>\n",
       "      <th>&amp;</th>\n",
       "      <th>'</th>\n",
       "      <th>(</th>\n",
       "      <th>)</th>\n",
       "      <th>...</th>\n",
       "      <th>Q</th>\n",
       "      <th>R</th>\n",
       "      <th>S</th>\n",
       "      <th>T</th>\n",
       "      <th>U</th>\n",
       "      <th>V</th>\n",
       "      <th>W</th>\n",
       "      <th>X</th>\n",
       "      <th>Y</th>\n",
       "      <th>Z</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>235</td>\n",
       "      <td>5192</td>\n",
       "      <td>905</td>\n",
       "      <td>7813</td>\n",
       "      <td>2895</td>\n",
       "      <td>5056</td>\n",
       "      <td>144</td>\n",
       "      <td>4225</td>\n",
       "      <td>7751</td>\n",
       "      <td>3462</td>\n",
       "      <td>...</td>\n",
       "      <td>4059</td>\n",
       "      <td>715</td>\n",
       "      <td>6407</td>\n",
       "      <td>6221</td>\n",
       "      <td>2760</td>\n",
       "      <td>5195</td>\n",
       "      <td>7500</td>\n",
       "      <td>1067</td>\n",
       "      <td>7710</td>\n",
       "      <td>9764</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>1 rows × 95 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "           !    \"     #     $     %    &     '     (     )  ...      Q    R  \\\n",
       "0  235  5192  905  7813  2895  5056  144  4225  7751  3462  ...   4059  715   \n",
       "\n",
       "      S     T     U     V     W     X     Y     Z  \n",
       "0  6407  6221  2760  5195  7500  1067  7710  9764  \n",
       "\n",
       "[1 rows x 95 columns]"
      ]
     },
     "execution_count": 97,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "random_vector(1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 98,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def random_tensor(random_state: int, size: int) -> pd.DataFrame:\n",
    "    \"\"\"\n",
    "    Return a random tensor.\n",
    "    \n",
    "    Args:\n",
    "        random_state: the random state for the tensor. will use\n",
    "                      [random_state, random_state + size] states\n",
    "        size: the size of the tensor to generate\n",
    "        \n",
    "    Returns: a random dataframe of `size` rows\n",
    "    \"\"\"\n",
    "    return pd.concat([random_vector(random_state + i) for i in range (0, size)])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 100,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th>!</th>\n",
       "      <th>\"</th>\n",
       "      <th>#</th>\n",
       "      <th>$</th>\n",
       "      <th>%</th>\n",
       "      <th>&amp;</th>\n",
       "      <th>'</th>\n",
       "      <th>(</th>\n",
       "      <th>)</th>\n",
       "      <th>...</th>\n",
       "      <th>Q</th>\n",
       "      <th>R</th>\n",
       "      <th>S</th>\n",
       "      <th>T</th>\n",
       "      <th>U</th>\n",
       "      <th>V</th>\n",
       "      <th>W</th>\n",
       "      <th>X</th>\n",
       "      <th>Y</th>\n",
       "      <th>Z</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>2732</td>\n",
       "      <td>9845</td>\n",
       "      <td>3264</td>\n",
       "      <td>4859</td>\n",
       "      <td>9225</td>\n",
       "      <td>7891</td>\n",
       "      <td>4373</td>\n",
       "      <td>5874</td>\n",
       "      <td>6744</td>\n",
       "      <td>3468</td>\n",
       "      <td>...</td>\n",
       "      <td>973</td>\n",
       "      <td>4464</td>\n",
       "      <td>8393</td>\n",
       "      <td>2418</td>\n",
       "      <td>3455</td>\n",
       "      <td>6167</td>\n",
       "      <td>5819</td>\n",
       "      <td>6521</td>\n",
       "      <td>6242</td>\n",
       "      <td>7742</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>235</td>\n",
       "      <td>5192</td>\n",
       "      <td>905</td>\n",
       "      <td>7813</td>\n",
       "      <td>2895</td>\n",
       "      <td>5056</td>\n",
       "      <td>144</td>\n",
       "      <td>4225</td>\n",
       "      <td>7751</td>\n",
       "      <td>3462</td>\n",
       "      <td>...</td>\n",
       "      <td>4059</td>\n",
       "      <td>715</td>\n",
       "      <td>6407</td>\n",
       "      <td>6221</td>\n",
       "      <td>2760</td>\n",
       "      <td>5195</td>\n",
       "      <td>7500</td>\n",
       "      <td>1067</td>\n",
       "      <td>7710</td>\n",
       "      <td>9764</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>2 rows × 95 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "            !     \"     #     $     %     &     '     (     )  ...      Q  \\\n",
       "0  2732  9845  3264  4859  9225  7891  4373  5874  6744  3468  ...    973   \n",
       "0   235  5192   905  7813  2895  5056   144  4225  7751  3462  ...   4059   \n",
       "\n",
       "      R     S     T     U     V     W     X     Y     Z  \n",
       "0  4464  8393  2418  3455  6167  5819  6521  6242  7742  \n",
       "0   715  6407  6221  2760  5195  7500  1067  7710  9764  \n",
       "\n",
       "[2 rows x 95 columns]"
      ]
     },
     "execution_count": 100,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df = random_tensor(0, 2)\n",
    "df"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Sum of Absolute Difference (SAD)\n",
    "\n",
    "* equivalent to $L_1$ norm\n",
    "* manhattan distance\n",
    "* taxicab-norm"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 101,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def manhattan_distance(this: DataFrame, that: DataFrame) -> float:\n",
    "    return (this - that).abs().sum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 102,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "308704"
      ]
     },
     "execution_count": 102,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "manhattan_distance(df.iloc[0], df.iloc[1])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Sum of Squared Differences (SSD)\n",
    "\n",
    "* equivalent to squared $L_2$ norm\n",
    "* euclidean norm\n",
    "* sensitive to outliers"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 103,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def euclidean_norm(this: DataFrame, that: DataFrame) -> float:\n",
    "    return ((this - that) ** 2).sum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 104,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1509487768"
      ]
     },
     "execution_count": 104,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "euclidean_norm(df.iloc[0], df.iloc[1])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Mean-Absolute Error (MAE)\n",
    "\n",
    "* normalized version of the sum of absolute difference.\n",
    "* normalized manhattan distance"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 105,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def mean_absolute_error(this: DataFrame, that: DataFrame) -> float:\n",
    "    return (this - that).abs().sum() / this.index.size"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 106,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "3249.5157894736844"
      ]
     },
     "execution_count": 106,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "mean_absolute_error(df.iloc[0], df.iloc[1])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Mean Squared Error (MSE)\n",
    "\n",
    "* normalized version of the sum of squared differences"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 107,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def mean_squared_error(this: DataFrame, that: DataFrame) -> float:\n",
    "    return ((this - that) ** 2).sum() / this.index.size"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 108,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "15889344.92631579"
      ]
     },
     "execution_count": 108,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "mean_squared_error(df.iloc[0], df.iloc[1])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Euclidean Distance\n",
    "\n",
    "* $L_2$-norm of the difference\n",
    "* a special case of the Minkowski distance with p=2. \n",
    "* It is the natural distance in a geometric interpretation."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 109,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def euclidean_distance(this: DataFrame, that: DataFrame) -> float:\n",
    "    return ((this - that) ** 2).sum() ** (0.5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 110,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "38852.126943064519"
      ]
     },
     "execution_count": 110,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "euclidean_distance(df.iloc[0], df.iloc[1])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Chebyshev Distance\n",
    "\n",
    "* $L_{\\infty}$-norm of the difference\n",
    "* a special case of the Minkowski distance where $p \\to \\infty$\n",
    "* It is also known as Chessboard distance."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 111,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def chebyshev_distance(this: DataFrame, that: DataFrame) -> float:\n",
    "    return (this - that).abs().max()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 112,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "8933"
      ]
     },
     "execution_count": 112,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "chebyshev_distance(df.iloc[0], df.iloc[1])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Minkowki Distance\n",
    "\n",
    "* the generalized $L_p$-norm of the difference. \n",
    "* additional argument $p$"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 113,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def minkowski_distance(this: DataFrame, that: DataFrame, p: float) -> float:\n",
    "    return ((this - that).abs() ** p).sum() ** (1.0 / p)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 114,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "20750.088857099327"
      ]
     },
     "execution_count": 114,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "minkowski_distance(df.iloc[0], df.iloc[1], 3)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Canberra Distance\n",
    "\n",
    "* a weighted version of the Manhattan distance\n",
    "* It is often used for data scattered around an origin\n",
    "    * it is biased for measures around the origin and very sensitive for values close to zero."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 115,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def canberra_distance(this: DataFrame, that: DataFrame) -> float:\n",
    "    # this can have some divide by zeroes that are ignored using replace to remmove nan and inf\n",
    "    return ((this - that).abs() / this.abs() + that.abs()).replace([np.nan, np.inf], 0).sum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 116,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "425381.53926460177"
      ]
     },
     "execution_count": 116,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "canberra_distance(df.iloc[0], df.iloc[1])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Cosine Distance\n",
    "\n",
    "* contains the dot product scaled by the product of the Euclidean distances from the origin. \n",
    "* represents the angular distance of two vectors while ignoring their scale."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 117,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def cosine_distance(this: DataFrame, that: DataFrame) -> float:\n",
    "    num = (this * that).sum()\n",
    "    den = ((this ** 2).sum() ** (0.5)) * ((that ** 2).sum() ** (0.5))\n",
    "    return num / den"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 118,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.75156642203019353"
      ]
     },
     "execution_count": 118,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "cosine_distance(df.iloc[0], df.iloc[1])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Pearson's Distance\n",
    "\n",
    "The Pearson distance is a correlation distance based on Pearson's product-momentum correlation coefficient of the two sample vectors. Since the correlation coefficient falls between [-1, 1], the Pearson distance lies in [0, 2] and measures the linear relationship between the two vectors."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 119,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def pearsons_distance(this: DataFrame, that: DataFrame) -> float:\n",
    "    return this.corr(that)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 120,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.096958032257038002"
      ]
     },
     "execution_count": 120,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pearsons_distance(df.iloc[0], df.iloc[1])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Hamming Distance\n",
    "\n",
    "the number of items in the vector that dont match"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 121,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def hamming_distance(this: DataFrame, that: DataFrame) -> float:\n",
    "    return (this != that).sum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 122,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "95"
      ]
     },
     "execution_count": 122,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "hamming_distance(df.iloc[0], df.iloc[1])"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
 }
	{
	"cells": [
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"Math courtesy of https://numerics.mathdotnet.com/distance.html"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 94,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"import numpy as np\n",
	"import pandas as pd\n",
	"from pandas import DataFrame\n",
	"from numpy.random import randint, seed"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 95,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"# the printable 95 characters in the ASCII table\n",
	"PRINTABLE = list(' !\"#$%&\\'()*+,-./:;<=>?@[]\\\\^_`~}{\|0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ')"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 96,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"def random_vector(random_state: int, columns: list=PRINTABLE, low: int=0, high: int=10000) -> pd.DataFrame:\n",
	" \"\"\"\n",
	" Return a random vector.\n",
	" \n",
	" Args:\n",
	" random_state: the random state to use\n",
	" columns: the name of the random columns\n",
	" low: the low value for the random range\n",
	" high: the high value for the random range\n",
	" \n",
	" Returns: a random dataframe with the columns\n",
	" \"\"\"\n",
	" if random_state is not None:\n",
	" seed(random_state)\n",
	" values = randint(low=low, high=high, size=len(columns)).reshape(1, len(columns))\n",
	" return pd.DataFrame(values, columns=columns) "
	]
	},
	{
	"cell_type": "code",
	"execution_count": 97,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/html": [
	"<div>\n",
	"<style>\n",
	" .dataframe thead tr:only-child th {\n",
	" text-align: right;\n",
	" }\n",
	"\n",
	" .dataframe thead th {\n",
	" text-align: left;\n",
	" }\n",
	"\n",
	" .dataframe tbody tr th {\n",
	" vertical-align: top;\n",
	" }\n",
	"</style>\n",
	"<table border=\"1\" class=\"dataframe\">\n",
	" <thead>\n",
	" <tr style=\"text-align: right;\">\n",
	" <th></th>\n",
	" <th></th>\n",
	" <th>!</th>\n",
	" <th>\"</th>\n",
	" <th>#</th>\n",
	" <th>$</th>\n",
	" <th>%</th>\n",
	" <th>&</th>\n",
	" <th>'</th>\n",
	" <th>(</th>\n",
	" <th>)</th>\n",
	" <th>...</th>\n",
	" <th>Q</th>\n",
	" <th>R</th>\n",
	" <th>S</th>\n",
	" <th>T</th>\n",
	" <th>U</th>\n",
	" <th>V</th>\n",
	" <th>W</th>\n",
	" <th>X</th>\n",
	" <th>Y</th>\n",
	" <th>Z</th>\n",
	" </tr>\n",
	" </thead>\n",
	" <tbody>\n",
	" <tr>\n",
	" <th>0</th>\n",
	" <td>235</td>\n",
	" <td>5192</td>\n",
	" <td>905</td>\n",
	" <td>7813</td>\n",
	" <td>2895</td>\n",
	" <td>5056</td>\n",
	" <td>144</td>\n",
	" <td>4225</td>\n",
	" <td>7751</td>\n",
	" <td>3462</td>\n",
	" <td>...</td>\n",
	" <td>4059</td>\n",
	" <td>715</td>\n",
	" <td>6407</td>\n",
	" <td>6221</td>\n",
	" <td>2760</td>\n",
	" <td>5195</td>\n",
	" <td>7500</td>\n",
	" <td>1067</td>\n",
	" <td>7710</td>\n",
	" <td>9764</td>\n",
	" </tr>\n",
	" </tbody>\n",
	"</table>\n",
	"<p>1 rows × 95 columns</p>\n",
	"</div>"
	],
	"text/plain": [
	" ! \" # $ % & ' ( ) ... Q R \\\n",
	"0 235 5192 905 7813 2895 5056 144 4225 7751 3462 ... 4059 715 \n",
	"\n",
	" S T U V W X Y Z \n",
	"0 6407 6221 2760 5195 7500 1067 7710 9764 \n",
	"\n",
	"[1 rows x 95 columns]"
	]
	},
	"execution_count": 97,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"random_vector(1)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 98,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"def random_tensor(random_state: int, size: int) -> pd.DataFrame:\n",
	" \"\"\"\n",
	" Return a random tensor.\n",
	" \n",
	" Args:\n",
	" random_state: the random state for the tensor. will use\n",
	" [random_state, random_state + size] states\n",
	" size: the size of the tensor to generate\n",
	" \n",
	" Returns: a random dataframe of `size` rows\n",
	" \"\"\"\n",
	" return pd.concat([random_vector(random_state + i) for i in range (0, size)])"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 100,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/html": [
	"<div>\n",
	"<style>\n",
	" .dataframe thead tr:only-child th {\n",
	" text-align: right;\n",
	" }\n",
	"\n",
	" .dataframe thead th {\n",
	" text-align: left;\n",
	" }\n",
	"\n",
	" .dataframe tbody tr th {\n",
	" vertical-align: top;\n",
	" }\n",
	"</style>\n",
	"<table border=\"1\" class=\"dataframe\">\n",
	" <thead>\n",
	" <tr style=\"text-align: right;\">\n",
	" <th></th>\n",
	" <th></th>\n",
	" <th>!</th>\n",
	" <th>\"</th>\n",
	" <th>#</th>\n",
	" <th>$</th>\n",
	" <th>%</th>\n",
	" <th>&</th>\n",
	" <th>'</th>\n",
	" <th>(</th>\n",
	" <th>)</th>\n",
	" <th>...</th>\n",
	" <th>Q</th>\n",
	" <th>R</th>\n",
	" <th>S</th>\n",
	" <th>T</th>\n",
	" <th>U</th>\n",
	" <th>V</th>\n",
	" <th>W</th>\n",
	" <th>X</th>\n",
	" <th>Y</th>\n",
	" <th>Z</th>\n",
	" </tr>\n",
	" </thead>\n",
	" <tbody>\n",
	" <tr>\n",
	" <th>0</th>\n",
	" <td>2732</td>\n",
	" <td>9845</td>\n",
	" <td>3264</td>\n",
	" <td>4859</td>\n",
	" <td>9225</td>\n",
	" <td>7891</td>\n",
	" <td>4373</td>\n",
	" <td>5874</td>\n",
	" <td>6744</td>\n",
	" <td>3468</td>\n",
	" <td>...</td>\n",
	" <td>973</td>\n",
	" <td>4464</td>\n",
	" <td>8393</td>\n",
	" <td>2418</td>\n",
	" <td>3455</td>\n",
	" <td>6167</td>\n",
	" <td>5819</td>\n",
	" <td>6521</td>\n",
	" <td>6242</td>\n",
	" <td>7742</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>0</th>\n",
	" <td>235</td>\n",
	" <td>5192</td>\n",
	" <td>905</td>\n",
	" <td>7813</td>\n",
	" <td>2895</td>\n",
	" <td>5056</td>\n",
	" <td>144</td>\n",
	" <td>4225</td>\n",
	" <td>7751</td>\n",
	" <td>3462</td>\n",
	" <td>...</td>\n",
	" <td>4059</td>\n",
	" <td>715</td>\n",
	" <td>6407</td>\n",
	" <td>6221</td>\n",
	" <td>2760</td>\n",
	" <td>5195</td>\n",
	" <td>7500</td>\n",
	" <td>1067</td>\n",
	" <td>7710</td>\n",
	" <td>9764</td>\n",
	" </tr>\n",
	" </tbody>\n",
	"</table>\n",
	"<p>2 rows × 95 columns</p>\n",
	"</div>"
	],
	"text/plain": [
	" ! \" # $ % & ' ( ) ... Q \\\n",
	"0 2732 9845 3264 4859 9225 7891 4373 5874 6744 3468 ... 973 \n",
	"0 235 5192 905 7813 2895 5056 144 4225 7751 3462 ... 4059 \n",
	"\n",
	" R S T U V W X Y Z \n",
	"0 4464 8393 2418 3455 6167 5819 6521 6242 7742 \n",
	"0 715 6407 6221 2760 5195 7500 1067 7710 9764 \n",
	"\n",
	"[2 rows x 95 columns]"
	]
	},
	"execution_count": 100,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"df = random_tensor(0, 2)\n",
	"df"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## Sum of Absolute Difference (SAD)\n",
	"\n",
	"* equivalent to $L_1$ norm\n",
	"* manhattan distance\n",
	"* taxicab-norm"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 101,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"def manhattan_distance(this: DataFrame, that: DataFrame) -> float:\n",
	" return (this - that).abs().sum()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 102,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"308704"
	]
	},
	"execution_count": 102,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"manhattan_distance(df.iloc[0], df.iloc[1])"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## Sum of Squared Differences (SSD)\n",
	"\n",
	"* equivalent to squared $L_2$ norm\n",
	"* euclidean norm\n",
	"* sensitive to outliers"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 103,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"def euclidean_norm(this: DataFrame, that: DataFrame) -> float:\n",
	" return ((this - that) ** 2).sum()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 104,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"1509487768"
	]
	},
	"execution_count": 104,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"euclidean_norm(df.iloc[0], df.iloc[1])"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## Mean-Absolute Error (MAE)\n",
	"\n",
	"* normalized version of the sum of absolute difference.\n",
	"* normalized manhattan distance"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 105,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"def mean_absolute_error(this: DataFrame, that: DataFrame) -> float:\n",
	" return (this - that).abs().sum() / this.index.size"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 106,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"3249.5157894736844"
	]
	},
	"execution_count": 106,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"mean_absolute_error(df.iloc[0], df.iloc[1])"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## Mean Squared Error (MSE)\n",
	"\n",
	"* normalized version of the sum of squared differences"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 107,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"def mean_squared_error(this: DataFrame, that: DataFrame) -> float:\n",
	" return ((this - that) ** 2).sum() / this.index.size"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 108,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"15889344.92631579"
	]
	},
	"execution_count": 108,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"mean_squared_error(df.iloc[0], df.iloc[1])"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## Euclidean Distance\n",
	"\n",
	"* $L_2$-norm of the difference\n",
	"* a special case of the Minkowski distance with p=2. \n",
	"* It is the natural distance in a geometric interpretation."
	]
	},
	{
	"cell_type": "code",
	"execution_count": 109,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"def euclidean_distance(this: DataFrame, that: DataFrame) -> float:\n",
	" return ((this - that) 2).sum() (0.5)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 110,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"38852.126943064519"
	]
	},
	"execution_count": 110,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"euclidean_distance(df.iloc[0], df.iloc[1])"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## Chebyshev Distance\n",
	"\n",
	"* $L_{\\infty}$-norm of the difference\n",
	"* a special case of the Minkowski distance where $p \\to \\infty$\n",
	"* It is also known as Chessboard distance."
	]
	},
	{
	"cell_type": "code",
	"execution_count": 111,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"def chebyshev_distance(this: DataFrame, that: DataFrame) -> float:\n",
	" return (this - that).abs().max()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 112,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"8933"
	]
	},
	"execution_count": 112,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"chebyshev_distance(df.iloc[0], df.iloc[1])"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## Minkowki Distance\n",
	"\n",
	"* the generalized $L_p$-norm of the difference. \n",
	"* additional argument $p$"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 113,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"def minkowski_distance(this: DataFrame, that: DataFrame, p: float) -> float:\n",
	" return ((this - that).abs() p).sum() (1.0 / p)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 114,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"20750.088857099327"
	]
	},
	"execution_count": 114,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"minkowski_distance(df.iloc[0], df.iloc[1], 3)"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## Canberra Distance\n",
	"\n",
	"* a weighted version of the Manhattan distance\n",
	"* It is often used for data scattered around an origin\n",
	" * it is biased for measures around the origin and very sensitive for values close to zero."
	]
	},
	{
	"cell_type": "code",
	"execution_count": 115,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"def canberra_distance(this: DataFrame, that: DataFrame) -> float:\n",
	" # this can have some divide by zeroes that are ignored using replace to remmove nan and inf\n",
	" return ((this - that).abs() / this.abs() + that.abs()).replace([np.nan, np.inf], 0).sum()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 116,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"425381.53926460177"
	]
	},
	"execution_count": 116,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"canberra_distance(df.iloc[0], df.iloc[1])"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## Cosine Distance\n",
	"\n",
	"* contains the dot product scaled by the product of the Euclidean distances from the origin. \n",
	"* represents the angular distance of two vectors while ignoring their scale."
	]
	},
	{
	"cell_type": "code",
	"execution_count": 117,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"def cosine_distance(this: DataFrame, that: DataFrame) -> float:\n",
	" num = (this * that).sum()\n",
	" den = ((this 2).sum() (0.5)) * ((that 2).sum() (0.5))\n",
	" return num / den"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 118,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"0.75156642203019353"
	]
	},
	"execution_count": 118,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"cosine_distance(df.iloc[0], df.iloc[1])"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## Pearson's Distance\n",
	"\n",
	"The Pearson distance is a correlation distance based on Pearson's product-momentum correlation coefficient of the two sample vectors. Since the correlation coefficient falls between [-1, 1], the Pearson distance lies in [0, 2] and measures the linear relationship between the two vectors."
	]
	},
	{
	"cell_type": "code",
	"execution_count": 119,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"def pearsons_distance(this: DataFrame, that: DataFrame) -> float:\n",
	" return this.corr(that)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 120,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"0.096958032257038002"
	]
	},
	"execution_count": 120,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"pearsons_distance(df.iloc[0], df.iloc[1])"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## Hamming Distance\n",
	"\n",
	"the number of items in the vector that dont match"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 121,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"def hamming_distance(this: DataFrame, that: DataFrame) -> float:\n",
	" return (this != that).sum()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 122,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"95"
	]
	},
	"execution_count": 122,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"hamming_distance(df.iloc[0], df.iloc[1])"
	]
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 3",
	"language": "python",
	"name": "python3"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.6.2"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 2
	}