Skip to content

Instantly share code, notes, and snippets.

@Kautenja
Created September 25, 2017 23:41
Show Gist options
  • Save Kautenja/6f00dc10e42a65ccd1f0ed1d4512d550 to your computer and use it in GitHub Desktop.
Save Kautenja/6f00dc10e42a65ccd1f0ed1d4512d550 to your computer and use it in GitHub Desktop.
Implementations of different distance functions in Python
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Math courtesy of https://numerics.mathdotnet.com/distance.html"
]
},
{
"cell_type": "code",
"execution_count": 94,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"from pandas import DataFrame\n",
"from numpy.random import randint, seed"
]
},
{
"cell_type": "code",
"execution_count": 95,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# the printable 95 characters in the ASCII table\n",
"PRINTABLE = list(' !\"#$%&\\'()*+,-./:;<=>?@[]\\\\^_`~}{|0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ')"
]
},
{
"cell_type": "code",
"execution_count": 96,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def random_vector(random_state: int, columns: list=PRINTABLE, low: int=0, high: int=10000) -> pd.DataFrame:\n",
" \"\"\"\n",
" Return a random vector.\n",
" \n",
" Args:\n",
" random_state: the random state to use\n",
" columns: the name of the random columns\n",
" low: the low value for the random range\n",
" high: the high value for the random range\n",
" \n",
" Returns: a random dataframe with the columns\n",
" \"\"\"\n",
" if random_state is not None:\n",
" seed(random_state)\n",
" values = randint(low=low, high=high, size=len(columns)).reshape(1, len(columns))\n",
" return pd.DataFrame(values, columns=columns) "
]
},
{
"cell_type": "code",
"execution_count": 97,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style>\n",
" .dataframe thead tr:only-child th {\n",
" text-align: right;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: left;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th></th>\n",
" <th>!</th>\n",
" <th>\"</th>\n",
" <th>#</th>\n",
" <th>$</th>\n",
" <th>%</th>\n",
" <th>&amp;</th>\n",
" <th>'</th>\n",
" <th>(</th>\n",
" <th>)</th>\n",
" <th>...</th>\n",
" <th>Q</th>\n",
" <th>R</th>\n",
" <th>S</th>\n",
" <th>T</th>\n",
" <th>U</th>\n",
" <th>V</th>\n",
" <th>W</th>\n",
" <th>X</th>\n",
" <th>Y</th>\n",
" <th>Z</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>235</td>\n",
" <td>5192</td>\n",
" <td>905</td>\n",
" <td>7813</td>\n",
" <td>2895</td>\n",
" <td>5056</td>\n",
" <td>144</td>\n",
" <td>4225</td>\n",
" <td>7751</td>\n",
" <td>3462</td>\n",
" <td>...</td>\n",
" <td>4059</td>\n",
" <td>715</td>\n",
" <td>6407</td>\n",
" <td>6221</td>\n",
" <td>2760</td>\n",
" <td>5195</td>\n",
" <td>7500</td>\n",
" <td>1067</td>\n",
" <td>7710</td>\n",
" <td>9764</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>1 rows × 95 columns</p>\n",
"</div>"
],
"text/plain": [
" ! \" # $ % & ' ( ) ... Q R \\\n",
"0 235 5192 905 7813 2895 5056 144 4225 7751 3462 ... 4059 715 \n",
"\n",
" S T U V W X Y Z \n",
"0 6407 6221 2760 5195 7500 1067 7710 9764 \n",
"\n",
"[1 rows x 95 columns]"
]
},
"execution_count": 97,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"random_vector(1)"
]
},
{
"cell_type": "code",
"execution_count": 98,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def random_tensor(random_state: int, size: int) -> pd.DataFrame:\n",
" \"\"\"\n",
" Return a random tensor.\n",
" \n",
" Args:\n",
" random_state: the random state for the tensor. will use\n",
" [random_state, random_state + size] states\n",
" size: the size of the tensor to generate\n",
" \n",
" Returns: a random dataframe of `size` rows\n",
" \"\"\"\n",
" return pd.concat([random_vector(random_state + i) for i in range (0, size)])"
]
},
{
"cell_type": "code",
"execution_count": 100,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style>\n",
" .dataframe thead tr:only-child th {\n",
" text-align: right;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: left;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th></th>\n",
" <th>!</th>\n",
" <th>\"</th>\n",
" <th>#</th>\n",
" <th>$</th>\n",
" <th>%</th>\n",
" <th>&amp;</th>\n",
" <th>'</th>\n",
" <th>(</th>\n",
" <th>)</th>\n",
" <th>...</th>\n",
" <th>Q</th>\n",
" <th>R</th>\n",
" <th>S</th>\n",
" <th>T</th>\n",
" <th>U</th>\n",
" <th>V</th>\n",
" <th>W</th>\n",
" <th>X</th>\n",
" <th>Y</th>\n",
" <th>Z</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>2732</td>\n",
" <td>9845</td>\n",
" <td>3264</td>\n",
" <td>4859</td>\n",
" <td>9225</td>\n",
" <td>7891</td>\n",
" <td>4373</td>\n",
" <td>5874</td>\n",
" <td>6744</td>\n",
" <td>3468</td>\n",
" <td>...</td>\n",
" <td>973</td>\n",
" <td>4464</td>\n",
" <td>8393</td>\n",
" <td>2418</td>\n",
" <td>3455</td>\n",
" <td>6167</td>\n",
" <td>5819</td>\n",
" <td>6521</td>\n",
" <td>6242</td>\n",
" <td>7742</td>\n",
" </tr>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>235</td>\n",
" <td>5192</td>\n",
" <td>905</td>\n",
" <td>7813</td>\n",
" <td>2895</td>\n",
" <td>5056</td>\n",
" <td>144</td>\n",
" <td>4225</td>\n",
" <td>7751</td>\n",
" <td>3462</td>\n",
" <td>...</td>\n",
" <td>4059</td>\n",
" <td>715</td>\n",
" <td>6407</td>\n",
" <td>6221</td>\n",
" <td>2760</td>\n",
" <td>5195</td>\n",
" <td>7500</td>\n",
" <td>1067</td>\n",
" <td>7710</td>\n",
" <td>9764</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>2 rows × 95 columns</p>\n",
"</div>"
],
"text/plain": [
" ! \" # $ % & ' ( ) ... Q \\\n",
"0 2732 9845 3264 4859 9225 7891 4373 5874 6744 3468 ... 973 \n",
"0 235 5192 905 7813 2895 5056 144 4225 7751 3462 ... 4059 \n",
"\n",
" R S T U V W X Y Z \n",
"0 4464 8393 2418 3455 6167 5819 6521 6242 7742 \n",
"0 715 6407 6221 2760 5195 7500 1067 7710 9764 \n",
"\n",
"[2 rows x 95 columns]"
]
},
"execution_count": 100,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df = random_tensor(0, 2)\n",
"df"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Sum of Absolute Difference (SAD)\n",
"\n",
"* equivalent to $L_1$ norm\n",
"* manhattan distance\n",
"* taxicab-norm"
]
},
{
"cell_type": "code",
"execution_count": 101,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def manhattan_distance(this: DataFrame, that: DataFrame) -> float:\n",
" return (this - that).abs().sum()"
]
},
{
"cell_type": "code",
"execution_count": 102,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"308704"
]
},
"execution_count": 102,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"manhattan_distance(df.iloc[0], df.iloc[1])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Sum of Squared Differences (SSD)\n",
"\n",
"* equivalent to squared $L_2$ norm\n",
"* euclidean norm\n",
"* sensitive to outliers"
]
},
{
"cell_type": "code",
"execution_count": 103,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def euclidean_norm(this: DataFrame, that: DataFrame) -> float:\n",
" return ((this - that) ** 2).sum()"
]
},
{
"cell_type": "code",
"execution_count": 104,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"1509487768"
]
},
"execution_count": 104,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"euclidean_norm(df.iloc[0], df.iloc[1])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Mean-Absolute Error (MAE)\n",
"\n",
"* normalized version of the sum of absolute difference.\n",
"* normalized manhattan distance"
]
},
{
"cell_type": "code",
"execution_count": 105,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def mean_absolute_error(this: DataFrame, that: DataFrame) -> float:\n",
" return (this - that).abs().sum() / this.index.size"
]
},
{
"cell_type": "code",
"execution_count": 106,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"3249.5157894736844"
]
},
"execution_count": 106,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"mean_absolute_error(df.iloc[0], df.iloc[1])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Mean Squared Error (MSE)\n",
"\n",
"* normalized version of the sum of squared differences"
]
},
{
"cell_type": "code",
"execution_count": 107,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def mean_squared_error(this: DataFrame, that: DataFrame) -> float:\n",
" return ((this - that) ** 2).sum() / this.index.size"
]
},
{
"cell_type": "code",
"execution_count": 108,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"15889344.92631579"
]
},
"execution_count": 108,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"mean_squared_error(df.iloc[0], df.iloc[1])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Euclidean Distance\n",
"\n",
"* $L_2$-norm of the difference\n",
"* a special case of the Minkowski distance with p=2. \n",
"* It is the natural distance in a geometric interpretation."
]
},
{
"cell_type": "code",
"execution_count": 109,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def euclidean_distance(this: DataFrame, that: DataFrame) -> float:\n",
" return ((this - that) ** 2).sum() ** (0.5)"
]
},
{
"cell_type": "code",
"execution_count": 110,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"38852.126943064519"
]
},
"execution_count": 110,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"euclidean_distance(df.iloc[0], df.iloc[1])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Chebyshev Distance\n",
"\n",
"* $L_{\\infty}$-norm of the difference\n",
"* a special case of the Minkowski distance where $p \\to \\infty$\n",
"* It is also known as Chessboard distance."
]
},
{
"cell_type": "code",
"execution_count": 111,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def chebyshev_distance(this: DataFrame, that: DataFrame) -> float:\n",
" return (this - that).abs().max()"
]
},
{
"cell_type": "code",
"execution_count": 112,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"8933"
]
},
"execution_count": 112,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"chebyshev_distance(df.iloc[0], df.iloc[1])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Minkowki Distance\n",
"\n",
"* the generalized $L_p$-norm of the difference. \n",
"* additional argument $p$"
]
},
{
"cell_type": "code",
"execution_count": 113,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def minkowski_distance(this: DataFrame, that: DataFrame, p: float) -> float:\n",
" return ((this - that).abs() ** p).sum() ** (1.0 / p)"
]
},
{
"cell_type": "code",
"execution_count": 114,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"20750.088857099327"
]
},
"execution_count": 114,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"minkowski_distance(df.iloc[0], df.iloc[1], 3)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Canberra Distance\n",
"\n",
"* a weighted version of the Manhattan distance\n",
"* It is often used for data scattered around an origin\n",
" * it is biased for measures around the origin and very sensitive for values close to zero."
]
},
{
"cell_type": "code",
"execution_count": 115,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def canberra_distance(this: DataFrame, that: DataFrame) -> float:\n",
" # this can have some divide by zeroes that are ignored using replace to remmove nan and inf\n",
" return ((this - that).abs() / this.abs() + that.abs()).replace([np.nan, np.inf], 0).sum()"
]
},
{
"cell_type": "code",
"execution_count": 116,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"425381.53926460177"
]
},
"execution_count": 116,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"canberra_distance(df.iloc[0], df.iloc[1])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Cosine Distance\n",
"\n",
"* contains the dot product scaled by the product of the Euclidean distances from the origin. \n",
"* represents the angular distance of two vectors while ignoring their scale."
]
},
{
"cell_type": "code",
"execution_count": 117,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def cosine_distance(this: DataFrame, that: DataFrame) -> float:\n",
" num = (this * that).sum()\n",
" den = ((this ** 2).sum() ** (0.5)) * ((that ** 2).sum() ** (0.5))\n",
" return num / den"
]
},
{
"cell_type": "code",
"execution_count": 118,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.75156642203019353"
]
},
"execution_count": 118,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"cosine_distance(df.iloc[0], df.iloc[1])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Pearson's Distance\n",
"\n",
"The Pearson distance is a correlation distance based on Pearson's product-momentum correlation coefficient of the two sample vectors. Since the correlation coefficient falls between [-1, 1], the Pearson distance lies in [0, 2] and measures the linear relationship between the two vectors."
]
},
{
"cell_type": "code",
"execution_count": 119,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def pearsons_distance(this: DataFrame, that: DataFrame) -> float:\n",
" return this.corr(that)"
]
},
{
"cell_type": "code",
"execution_count": 120,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.096958032257038002"
]
},
"execution_count": 120,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pearsons_distance(df.iloc[0], df.iloc[1])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Hamming Distance\n",
"\n",
"the number of items in the vector that dont match"
]
},
{
"cell_type": "code",
"execution_count": 121,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def hamming_distance(this: DataFrame, that: DataFrame) -> float:\n",
" return (this != that).sum()"
]
},
{
"cell_type": "code",
"execution_count": 122,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"95"
]
},
"execution_count": 122,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"hamming_distance(df.iloc[0], df.iloc[1])"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.2"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment