Skip to content

Instantly share code, notes, and snippets.

@internaut
Last active June 12, 2020 14:30
Show Gist options
  • Save internaut/5a653317688b14fd0fc67214c1352831 to your computer and use it in GitHub Desktop.
Save internaut/5a653317688b14fd0fc67214c1352831 to your computer and use it in GitHub Desktop.
Shows how to do a cross join (i.e. cartesian product) between two pandas DataFrames using an example on calculating the distances between origin and destination cities. See https://mkonrad.net/2016/04/16/cross-join--cartesian-product-between-pandas-dataframes.html
"""
Shows how to do a cross join (i.e. cartesian product) between two pandas DataFrames using an example on
calculating the distances between origin and destination cities.
Tested with pandas 0.17.1 and 0.18 on Python 3.4 and Python 3.5
Best run this with Spyder (see https://github.com/spyder-ide/spyder)
Author: Markus Konrad <[email protected]>
April 2016
"""
import pandas as pd
import math
#%% define some functions that we'll need
def df_crossjoin(df1, df2, **kwargs):
"""
Make a cross join (cartesian product) between two dataframes by using a constant temporary key.
Also sets a MultiIndex which is the cartesian product of the indices of the input dataframes.
See: https://github.com/pydata/pandas/issues/5401
:param df1 dataframe 1
:param df1 dataframe 2
:param kwargs keyword arguments that will be passed to pd.merge()
:return cross join of df1 and df2
"""
df1['_tmpkey'] = 1
df2['_tmpkey'] = 1
res = pd.merge(df1, df2, on='_tmpkey', **kwargs).drop('_tmpkey', axis=1)
res.index = pd.MultiIndex.from_product((df1.index, df2.index))
df1.drop('_tmpkey', axis=1, inplace=True)
df2.drop('_tmpkey', axis=1, inplace=True)
return res
def haversine(p1, p2):
"""
Calculate distance between two points on earth in km
See: http://www.movable-type.co.uk/scripts/latlong.html
:param p1 point 1 tuple (latitude, longitude)
:param p2 point 2 tuple (latitude, longitude)
:return distance between points p1 and p2 on earth in km
"""
R = 6371 # earth radius in km
p1 = [math.radians(v) for v in p1]
p2 = [math.radians(v) for v in p2]
d_lat = p2[0] - p1[0]
d_lng = p2[1] - p1[1]
a = math.pow(math.sin(d_lat / 2), 2) + math.cos(p1[0]) * math.cos(p2[0]) * math.pow(math.sin(d_lng / 2), 2)
c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))
return R * c
#%% define the data
origin_cities = ['Berlin', 'Hamburg', 'Munich']
origin_coords = {
'lat': [52.518611, 53.550556, 48.137222],
'lng': [13.408333, 9.993333, 11.575556]
}
destination_cities = ['London', 'New York City', 'Moscow', 'Sydney', 'Istanbul']
destination_coords = {
'lat': [51.50939, 40.712778, 55.75, -33.85, 41.01],
'lng': [-0.11832, -74.005833, 37.616667, 151.2, 28.960278]
}
df_orig = pd.DataFrame(origin_coords, index=origin_cities)
df_dest = pd.DataFrame(destination_coords, index=destination_cities)
print(df_orig)
print(df_dest)
#%% perform the cross join
dfx = df_crossjoin(df_orig, df_dest, suffixes=('_orig', '_dest'))
print(dfx)
#%% calculate the distances by applying the haversine() function to each row
dist = dfx.apply(lambda row: haversine((row['lat_orig'], row['lng_orig']), (row['lat_dest'], row['lng_dest'])),
axis=1)
print(dist)
#%% get the 3 smallest distances per origin city
nearest3 = dist.groupby(level=0).nsmallest(3)
print(nearest3)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment