Last active
September 24, 2020 11:52
-
-
Save yongjun21/7f45ac3649fa6e4dfb3b8d4f8d32f1f5 to your computer and use it in GitHub Desktop.
Neat little trick to efficiently compute Euclidean distance in pandas
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import numpy as np\n", | |
"import pandas as pd" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import sqlite3\n", | |
"\n", | |
"conn = sqlite3.connect('data/home_sales.db')\n", | |
"data = pd.read_sql('select * from sales', conn)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>latitude</th>\n", | |
" <th>longitude</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>47.5112</td>\n", | |
" <td>-122.257</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>47.7210</td>\n", | |
" <td>-122.319</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>47.7379</td>\n", | |
" <td>-122.233</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>47.5208</td>\n", | |
" <td>-122.393</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>47.6168</td>\n", | |
" <td>-122.045</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" latitude longitude\n", | |
"0 47.5112 -122.257\n", | |
"1 47.7210 -122.319\n", | |
"2 47.7379 -122.233\n", | |
"3 47.5208 -122.393\n", | |
"4 47.6168 -122.045" | |
] | |
}, | |
"execution_count": 3, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"data[['latitude', 'longitude']].head()" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Project `latitude` & `longitude` to a coordinate system where `x` and `y` share the same unit" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"0 (392765.0276081249, 57828.29664632535)\n", | |
"1 (388545.84479586943, 81237.94165904315)\n", | |
"2 (395030.97139233735, 82995.64590191713)\n", | |
"3 (382544.0587780627, 59093.97027054378)\n", | |
"4 (408916.9345252828, 69294.09407877702)\n", | |
"Name: xy, dtype: object" | |
] | |
}, | |
"execution_count": 4, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"from pyproj import Transformer\n", | |
"\n", | |
"transformer = Transformer.from_crs(\"EPSG:4326\", \"EPSG:2855\", always_xy=True)\n", | |
"\n", | |
"def project2855(row):\n", | |
" if pd.isna(row['longitude']) or pd.isna(row['latitude']): return None\n", | |
" return transformer.transform(row['longitude'], row['latitude'])\n", | |
"\n", | |
"data['xy'] = data.apply(project2855, axis=1)\n", | |
"data['xy'].head()" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Compute Euclidean distance (without vectorization)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"0 0.000000\n", | |
"1 23786.823731\n", | |
"2 25269.150555\n", | |
"3 10299.035564\n", | |
"4 19807.791594\n", | |
"Name: xy, dtype: float64" | |
] | |
}, | |
"execution_count": 5, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"def euclidean(a, b):\n", | |
" return ((a[0] - b[0]) ** 2 + (a[1] - b[1]) ** 2) ** 0.5\n", | |
"\n", | |
"ref_xy = data.loc[0, 'xy']\n", | |
"\n", | |
"distance = data['xy'].apply(lambda xy: xy and euclidean(xy, ref_xy))\n", | |
"distance.head()" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Vectorized code\n", | |
"\n", | |
"Less readable and many intermediate variables" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"0 0.000000\n", | |
"1 23786.823731\n", | |
"2 25269.150555\n", | |
"3 10299.035564\n", | |
"4 19807.791594\n", | |
"dtype: float64" | |
] | |
}, | |
"execution_count": 6, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"data['x'] = data['xy'].apply(lambda xy: xy and xy[0])\n", | |
"data['y'] = data['xy'].apply(lambda xy: xy and xy[1])\n", | |
"\n", | |
"ref_x, ref_y = ref_xy\n", | |
"\n", | |
"distance = ((data['x'] - ref_x) ** 2 + (data['y'] - ref_y) ** 2) ** 0.5\n", | |
"distance.head()" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Cast `xy` into complex data type" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"0 392765.027608+57828.296646j\n", | |
"1 388545.844796+81237.941659j\n", | |
"2 395030.971392+82995.645902j\n", | |
"3 382544.058778+59093.970271j\n", | |
"4 408916.934525+69294.094079j\n", | |
"Name: xy, dtype: complex128" | |
] | |
}, | |
"execution_count": 7, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"data['xy'] = data['x'] + data['y'] * 1j\n", | |
"data['xy'].head()" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Compute L2 norm (Euclidean distance) using `numpy.absolute`\n", | |
"\n", | |
"Clean and readable" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"0 0.000000\n", | |
"1 23786.823731\n", | |
"2 25269.150555\n", | |
"3 10299.035564\n", | |
"4 19807.791594\n", | |
"Name: xy, dtype: float64" | |
] | |
}, | |
"execution_count": 8, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"ref_xy = ref_x + ref_y * 1j\n", | |
"\n", | |
"distance = (data['xy'] - ref_xy).abs()\n", | |
"distance.head()" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Convert complex `xy` to real values for serialization" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 9, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"data['x'] = data['xy'].apply(np.real)\n", | |
"data['y'] = data['xy'].apply(np.imag)\n", | |
"data.drop('xy', axis=1, inplace=True)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Reconstitute complex `xy` from deserialized values" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 10, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"data['xy'] = data['x'] + data['y'] * 1j" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.7.8" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 4 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment