Skip to content

Instantly share code, notes, and snippets.

@ethen8181
Last active February 20, 2017 03:09
Show Gist options
  • Save ethen8181/b670b0218defce3588e5cd2292e87d68 to your computer and use it in GitHub Desktop.
Save ethen8181/b670b0218defce3588e5cd2292e87d68 to your computer and use it in GitHub Desktop.
geospatial
# python3.5
import os
import heapq
import joblib
import numpy as np
import pandas as pd
from shapely.geometry import LineString, Point
from sklearn.neighbors import NearestNeighbors
from collections import defaultdict, namedtuple
def read_data(folder):
"""read in probe and link data given the folder name"""
probe_path = os.path.join(folder, 'Partition6467ProbePoints.csv')
probe = pd.read_csv(probe_path, header = None)
probe.columns = ['sampleID', 'dateTime', 'sourceCode', 'latitude',
'longitude', 'altitude', 'speed', 'heading']
link_path = os.path.join(folder, 'Partition6467LinkData.csv')
link = pd.read_csv(link_path, header = None)
link.columns = ['linkPVID', 'refNodeID', 'nrefNodeID', 'length',
'functionalClass', 'directionOfTravel', 'speedCategory',
'fromRefSpeedLimit', 'toRefSpeedLimit', 'fromRefNumLanes',
'toRefNumLanes', 'multiDigitized', 'urban', 'timeZone',
'shapeInfo', 'curvatureInfo', 'slopeInfo']
def parse_shape(shape_info):
"""extract the longitude and latitude of the shape"""
nodes = shape_info.split('|')
result = np.zeros(( len(nodes), 2), dtype = np.float)
for idx, node in enumerate(nodes):
position = node.split('/')[:2]
result[idx] = position
return result
link['nodePositions'] = link['shapeInfo'].apply(lambda row: parse_shape(row))
return probe, link
def train_knn(link, probe_positions, k):
"""
obtain a probe points nearest nodes and
the link that those nodes belong to
Parameters
----------
link : DataFrame
link data
probe_positions : 2d numpy array
probe's latitude and longitude
k : int
K-nearest neighbors' k
Return
------
neighbors : list
each element of the list is a dictionary, the dictionary's
key is the probe's nearest neighbor's node's position and the
value is the link id that the node belongs to
"""
# create the reversed dictionary, where the
# latitude and longitude of each node is the
# key and the link id is the value
node_info = {}
link_ids = link['linkPVID']
nodes = link['nodePositions']
for node, link_id in zip(nodes, link_ids):
for n in node:
node_info[tuple(n)] = link_id
# prepare data and train the KNN model, to get the
# k nearest nodes for each probe point
node_positions = np.vstack(link['nodePositions'].values)
model = NearestNeighbors(n_neighbors = k, metric = 'euclidean')
model.fit(node_positions)
_, indices = model.kneighbors(probe_positions)
closest_nodes = node_positions[indices]
# for each probe point's nearest node, obtain the link
# that each node belongs to, the information is stored
# in a dictionary where the key is the node position and
# the value is the link that it corresponds to
neighbors = []
for i in range(closest_nodes.shape[0]):
neighbor = {}
for j in closest_nodes[i]:
neighbor[tuple(j)] = node_info[tuple(j)]
neighbors.append(neighbor)
return neighbors
def get_closest_link(query, neighbor):
"""
given a query probe point and its
nearest neighbors obtain the nearest link
Parameter
---------
query : 1d numpy array
the probe's longitude and latitude
neighbor : dict
each key is the longitude and latitude of the nearest
neighbors and the value is the corresponding link id
Returns
-------
closest_link : int
the query probe's closest link
"""
nodes = defaultdict(list)
dist_pair = namedtuple('dist_pair', ['dist', 'link', 'point'])
for position, link_id in neighbor.items():
nodes[link_id].append(position)
dist_infos = []
for link, node in nodes.items():
if len(node) > 1:
line = LineString(node) # ?can we pre-computed this
dist = line.project(Point(query))
point = line.interpolate(dist)
point = point.x, point.y
else:
# if the only 1 node from the link is in the
# nearest neighborhood, then simply compute
# the euclidean distance from that node to the link
diff = query - np.array(node[0])
dist = np.sqrt( np.power(diff, 2).sum() )
# ?change this to get another node from this link, and compute the projected distance
point = node[0]
dist_info = dist_pair(dist, link, point)
dist_infos.append(dist_info)
closest_info = heapq.nsmallest(1, dist_infos)[0]
closest_link = closest_info.link
closest_point = closest_info.point
return closest_link, closest_point
def create_output(probe_positions, neighbors):
"""
create the probe's projected distance and
the link that it belongs to
"""
link_ids = []
projected_points = []
for i, pos in enumerate(probe_positions):
link_id, projected_point = get_closest_link(pos, neighbors[i])
link_ids.append(link_id)
projected_points.append(projected_point)
df_link_id = pd.DataFrame(link_ids, columns = ['link_ids'])
df_projected_points = pd.DataFrame(projected_points,
columns = ['projected_x', 'projected_y'])
output = pd.concat([df_link_id, df_projected_points], axis = 1)
output.to_csv('output.csv', index = False)
return output
def haversine_distance(probe, link):
"""
distance metric for measuring two points on the earth,
not used due to its computational inefficiency
"""
probe = np.deg2rad(probe)
link = np.deg2rad(link)
hav1 = 1 - np.cos(probe[0] - link[0]) / 2
hav2 = 1 - np.cos(probe[1] - link[1]) / 2
hav = hav1 + np.cos(probe[0]) * np.cos(probe[1]) * hav2
return 2 * np.arcsin(np.sqrt(hav))
if __name__ == '__main__':
# specify k nearest neighbor's K and the file path and
# the script will output a .csv file under the current
# directory, the file contains the probe's projected
# latitude, longitude and the link id that it belongs to
k = 10
folder = 'probe_data_map_matching'
probe, link = read_data(folder)
probe_positions = probe[['latitude', 'longitude']].values
neighbors = train_knn(link, probe_positions, k)
output = create_output(probe_positions, neighbors)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment