Created
September 25, 2017 23:41
-
-
Save Kautenja/6f00dc10e42a65ccd1f0ed1d4512d550 to your computer and use it in GitHub Desktop.
Implementations of different distance functions in Python
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Math courtesy of https://numerics.mathdotnet.com/distance.html" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 94, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"import numpy as np\n", | |
"import pandas as pd\n", | |
"from pandas import DataFrame\n", | |
"from numpy.random import randint, seed" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 95, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"# the printable 95 characters in the ASCII table\n", | |
"PRINTABLE = list(' !\"#$%&\\'()*+,-./:;<=>?@[]\\\\^_`~}{|0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 96, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"def random_vector(random_state: int, columns: list=PRINTABLE, low: int=0, high: int=10000) -> pd.DataFrame:\n", | |
" \"\"\"\n", | |
" Return a random vector.\n", | |
" \n", | |
" Args:\n", | |
" random_state: the random state to use\n", | |
" columns: the name of the random columns\n", | |
" low: the low value for the random range\n", | |
" high: the high value for the random range\n", | |
" \n", | |
" Returns: a random dataframe with the columns\n", | |
" \"\"\"\n", | |
" if random_state is not None:\n", | |
" seed(random_state)\n", | |
" values = randint(low=low, high=high, size=len(columns)).reshape(1, len(columns))\n", | |
" return pd.DataFrame(values, columns=columns) " | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 97, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style>\n", | |
" .dataframe thead tr:only-child th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: left;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th>!</th>\n", | |
" <th>\"</th>\n", | |
" <th>#</th>\n", | |
" <th>$</th>\n", | |
" <th>%</th>\n", | |
" <th>&</th>\n", | |
" <th>'</th>\n", | |
" <th>(</th>\n", | |
" <th>)</th>\n", | |
" <th>...</th>\n", | |
" <th>Q</th>\n", | |
" <th>R</th>\n", | |
" <th>S</th>\n", | |
" <th>T</th>\n", | |
" <th>U</th>\n", | |
" <th>V</th>\n", | |
" <th>W</th>\n", | |
" <th>X</th>\n", | |
" <th>Y</th>\n", | |
" <th>Z</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>235</td>\n", | |
" <td>5192</td>\n", | |
" <td>905</td>\n", | |
" <td>7813</td>\n", | |
" <td>2895</td>\n", | |
" <td>5056</td>\n", | |
" <td>144</td>\n", | |
" <td>4225</td>\n", | |
" <td>7751</td>\n", | |
" <td>3462</td>\n", | |
" <td>...</td>\n", | |
" <td>4059</td>\n", | |
" <td>715</td>\n", | |
" <td>6407</td>\n", | |
" <td>6221</td>\n", | |
" <td>2760</td>\n", | |
" <td>5195</td>\n", | |
" <td>7500</td>\n", | |
" <td>1067</td>\n", | |
" <td>7710</td>\n", | |
" <td>9764</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"<p>1 rows × 95 columns</p>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" ! \" # $ % & ' ( ) ... Q R \\\n", | |
"0 235 5192 905 7813 2895 5056 144 4225 7751 3462 ... 4059 715 \n", | |
"\n", | |
" S T U V W X Y Z \n", | |
"0 6407 6221 2760 5195 7500 1067 7710 9764 \n", | |
"\n", | |
"[1 rows x 95 columns]" | |
] | |
}, | |
"execution_count": 97, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"random_vector(1)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 98, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"def random_tensor(random_state: int, size: int) -> pd.DataFrame:\n", | |
" \"\"\"\n", | |
" Return a random tensor.\n", | |
" \n", | |
" Args:\n", | |
" random_state: the random state for the tensor. will use\n", | |
" [random_state, random_state + size] states\n", | |
" size: the size of the tensor to generate\n", | |
" \n", | |
" Returns: a random dataframe of `size` rows\n", | |
" \"\"\"\n", | |
" return pd.concat([random_vector(random_state + i) for i in range (0, size)])" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 100, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style>\n", | |
" .dataframe thead tr:only-child th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: left;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" <th>!</th>\n", | |
" <th>\"</th>\n", | |
" <th>#</th>\n", | |
" <th>$</th>\n", | |
" <th>%</th>\n", | |
" <th>&</th>\n", | |
" <th>'</th>\n", | |
" <th>(</th>\n", | |
" <th>)</th>\n", | |
" <th>...</th>\n", | |
" <th>Q</th>\n", | |
" <th>R</th>\n", | |
" <th>S</th>\n", | |
" <th>T</th>\n", | |
" <th>U</th>\n", | |
" <th>V</th>\n", | |
" <th>W</th>\n", | |
" <th>X</th>\n", | |
" <th>Y</th>\n", | |
" <th>Z</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>2732</td>\n", | |
" <td>9845</td>\n", | |
" <td>3264</td>\n", | |
" <td>4859</td>\n", | |
" <td>9225</td>\n", | |
" <td>7891</td>\n", | |
" <td>4373</td>\n", | |
" <td>5874</td>\n", | |
" <td>6744</td>\n", | |
" <td>3468</td>\n", | |
" <td>...</td>\n", | |
" <td>973</td>\n", | |
" <td>4464</td>\n", | |
" <td>8393</td>\n", | |
" <td>2418</td>\n", | |
" <td>3455</td>\n", | |
" <td>6167</td>\n", | |
" <td>5819</td>\n", | |
" <td>6521</td>\n", | |
" <td>6242</td>\n", | |
" <td>7742</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>235</td>\n", | |
" <td>5192</td>\n", | |
" <td>905</td>\n", | |
" <td>7813</td>\n", | |
" <td>2895</td>\n", | |
" <td>5056</td>\n", | |
" <td>144</td>\n", | |
" <td>4225</td>\n", | |
" <td>7751</td>\n", | |
" <td>3462</td>\n", | |
" <td>...</td>\n", | |
" <td>4059</td>\n", | |
" <td>715</td>\n", | |
" <td>6407</td>\n", | |
" <td>6221</td>\n", | |
" <td>2760</td>\n", | |
" <td>5195</td>\n", | |
" <td>7500</td>\n", | |
" <td>1067</td>\n", | |
" <td>7710</td>\n", | |
" <td>9764</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"<p>2 rows × 95 columns</p>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" ! \" # $ % & ' ( ) ... Q \\\n", | |
"0 2732 9845 3264 4859 9225 7891 4373 5874 6744 3468 ... 973 \n", | |
"0 235 5192 905 7813 2895 5056 144 4225 7751 3462 ... 4059 \n", | |
"\n", | |
" R S T U V W X Y Z \n", | |
"0 4464 8393 2418 3455 6167 5819 6521 6242 7742 \n", | |
"0 715 6407 6221 2760 5195 7500 1067 7710 9764 \n", | |
"\n", | |
"[2 rows x 95 columns]" | |
] | |
}, | |
"execution_count": 100, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"df = random_tensor(0, 2)\n", | |
"df" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Sum of Absolute Difference (SAD)\n", | |
"\n", | |
"* equivalent to $L_1$ norm\n", | |
"* manhattan distance\n", | |
"* taxicab-norm" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 101, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"def manhattan_distance(this: DataFrame, that: DataFrame) -> float:\n", | |
" return (this - that).abs().sum()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 102, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"308704" | |
] | |
}, | |
"execution_count": 102, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"manhattan_distance(df.iloc[0], df.iloc[1])" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Sum of Squared Differences (SSD)\n", | |
"\n", | |
"* equivalent to squared $L_2$ norm\n", | |
"* euclidean norm\n", | |
"* sensitive to outliers" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 103, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"def euclidean_norm(this: DataFrame, that: DataFrame) -> float:\n", | |
" return ((this - that) ** 2).sum()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 104, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"1509487768" | |
] | |
}, | |
"execution_count": 104, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"euclidean_norm(df.iloc[0], df.iloc[1])" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Mean-Absolute Error (MAE)\n", | |
"\n", | |
"* normalized version of the sum of absolute difference.\n", | |
"* normalized manhattan distance" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 105, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"def mean_absolute_error(this: DataFrame, that: DataFrame) -> float:\n", | |
" return (this - that).abs().sum() / this.index.size" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 106, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"3249.5157894736844" | |
] | |
}, | |
"execution_count": 106, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"mean_absolute_error(df.iloc[0], df.iloc[1])" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Mean Squared Error (MSE)\n", | |
"\n", | |
"* normalized version of the sum of squared differences" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 107, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"def mean_squared_error(this: DataFrame, that: DataFrame) -> float:\n", | |
" return ((this - that) ** 2).sum() / this.index.size" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 108, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"15889344.92631579" | |
] | |
}, | |
"execution_count": 108, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"mean_squared_error(df.iloc[0], df.iloc[1])" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Euclidean Distance\n", | |
"\n", | |
"* $L_2$-norm of the difference\n", | |
"* a special case of the Minkowski distance with p=2. \n", | |
"* It is the natural distance in a geometric interpretation." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 109, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"def euclidean_distance(this: DataFrame, that: DataFrame) -> float:\n", | |
" return ((this - that) ** 2).sum() ** (0.5)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 110, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"38852.126943064519" | |
] | |
}, | |
"execution_count": 110, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"euclidean_distance(df.iloc[0], df.iloc[1])" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Chebyshev Distance\n", | |
"\n", | |
"* $L_{\\infty}$-norm of the difference\n", | |
"* a special case of the Minkowski distance where $p \\to \\infty$\n", | |
"* It is also known as Chessboard distance." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 111, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"def chebyshev_distance(this: DataFrame, that: DataFrame) -> float:\n", | |
" return (this - that).abs().max()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 112, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"8933" | |
] | |
}, | |
"execution_count": 112, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"chebyshev_distance(df.iloc[0], df.iloc[1])" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Minkowki Distance\n", | |
"\n", | |
"* the generalized $L_p$-norm of the difference. \n", | |
"* additional argument $p$" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 113, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"def minkowski_distance(this: DataFrame, that: DataFrame, p: float) -> float:\n", | |
" return ((this - that).abs() ** p).sum() ** (1.0 / p)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 114, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"20750.088857099327" | |
] | |
}, | |
"execution_count": 114, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"minkowski_distance(df.iloc[0], df.iloc[1], 3)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Canberra Distance\n", | |
"\n", | |
"* a weighted version of the Manhattan distance\n", | |
"* It is often used for data scattered around an origin\n", | |
" * it is biased for measures around the origin and very sensitive for values close to zero." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 115, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"def canberra_distance(this: DataFrame, that: DataFrame) -> float:\n", | |
" # this can have some divide by zeroes that are ignored using replace to remmove nan and inf\n", | |
" return ((this - that).abs() / this.abs() + that.abs()).replace([np.nan, np.inf], 0).sum()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 116, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"425381.53926460177" | |
] | |
}, | |
"execution_count": 116, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"canberra_distance(df.iloc[0], df.iloc[1])" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Cosine Distance\n", | |
"\n", | |
"* contains the dot product scaled by the product of the Euclidean distances from the origin. \n", | |
"* represents the angular distance of two vectors while ignoring their scale." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 117, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"def cosine_distance(this: DataFrame, that: DataFrame) -> float:\n", | |
" num = (this * that).sum()\n", | |
" den = ((this ** 2).sum() ** (0.5)) * ((that ** 2).sum() ** (0.5))\n", | |
" return num / den" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 118, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"0.75156642203019353" | |
] | |
}, | |
"execution_count": 118, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"cosine_distance(df.iloc[0], df.iloc[1])" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Pearson's Distance\n", | |
"\n", | |
"The Pearson distance is a correlation distance based on Pearson's product-momentum correlation coefficient of the two sample vectors. Since the correlation coefficient falls between [-1, 1], the Pearson distance lies in [0, 2] and measures the linear relationship between the two vectors." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 119, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"def pearsons_distance(this: DataFrame, that: DataFrame) -> float:\n", | |
" return this.corr(that)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 120, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"0.096958032257038002" | |
] | |
}, | |
"execution_count": 120, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"pearsons_distance(df.iloc[0], df.iloc[1])" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Hamming Distance\n", | |
"\n", | |
"the number of items in the vector that dont match" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 121, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"def hamming_distance(this: DataFrame, that: DataFrame) -> float:\n", | |
" return (this != that).sum()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 122, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"95" | |
] | |
}, | |
"execution_count": 122, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"hamming_distance(df.iloc[0], df.iloc[1])" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.6.2" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment