ethen8181 · February 20, 2017 03:09
diff --git a/hw2.py b/hw2.py
 # python3.5
 import os
 import heapq
 import joblib
 import numpy as np
 import pandas as pd
 from shapely.geometry import LineString, Point
 from sklearn.neighbors import NearestNeighbors
 from collections import defaultdict, namedtuple

 def read_data(folder):
    """read in probe and link data given the folder name"""
    probe_path = os.path.join(folder, 'Partition6467ProbePoints.csv')
    probe = pd.read_csv(probe_path, header = None)
    probe.columns = ['sampleID', 'dateTime', 'sourceCode', 'latitude',
                     'longitude', 'altitude', 'speed', 'heading']

    link_path = os.path.join(folder, 'Partition6467LinkData.csv')
    link = pd.read_csv(link_path, header = None)
    link.columns = ['linkPVID', 'refNodeID', 'nrefNodeID', 'length', 
                    'functionalClass', 'directionOfTravel', 'speedCategory', 
                    'fromRefSpeedLimit', 'toRefSpeedLimit', 'fromRefNumLanes', 
                    'toRefNumLanes', 'multiDigitized', 'urban', 'timeZone', 
                    'shapeInfo', 'curvatureInfo', 'slopeInfo']
    
    def parse_shape(shape_info):
        """extract the longitude and latitude of the shape"""
        nodes = shape_info.split('|')
        result = np.zeros(( len(nodes), 2), dtype = np.float)
        for idx, node in enumerate(nodes):
            position = node.split('/')[:2]
            result[idx] = position

        return result

    link['nodePositions'] = link['shapeInfo'].apply(lambda row: parse_shape(row))
    return probe, link


 def train_knn(link, probe_positions, k):
    """
    obtain a probe points nearest nodes and
    the link that those nodes belong to

    Parameters
    ----------
    link : DataFrame
        link data

    probe_positions : 2d numpy array
        probe's latitude and longitude

    k : int
        K-nearest neighbors' k

    Return
    ------
    neighbors : list
        each element of the list is a dictionary, the dictionary's
        key is the probe's nearest neighbor's node's position and the
        value is the link id that the node belongs to
    """
    # create the reversed dictionary, where the 
    # latitude and longitude of each node is the 
    # key and the link id is the value
    node_info = {}
    link_ids = link['linkPVID']
    nodes = link['nodePositions']
    for node, link_id in zip(nodes, link_ids):
        for n in node:
            node_info[tuple(n)] = link_id

    # prepare data and train the KNN model, to get the
    # k nearest nodes for each probe point
    node_positions = np.vstack(link['nodePositions'].values)
    model = NearestNeighbors(n_neighbors = k, metric = 'euclidean')
    model.fit(node_positions)
    _, indices = model.kneighbors(probe_positions)
    closest_nodes = node_positions[indices]

    # for each probe point's nearest node, obtain the link
    # that each node belongs to, the information is stored
    # in a dictionary where the key is the node position and
    # the value is the link that it corresponds to
    neighbors = []
    for i in range(closest_nodes.shape[0]):
        neighbor = {}
        for j in closest_nodes[i]:
            neighbor[tuple(j)] = node_info[tuple(j)]

        neighbors.append(neighbor)
    
    return neighbors


 def get_closest_link(query, neighbor):
    """
    given a query probe point and its
    nearest neighbors obtain the nearest link

    Parameter
    ---------
    query : 1d numpy array
        the probe's longitude and latitude

    neighbor : dict
        each key is the longitude and latitude of the nearest
        neighbors and the value is the corresponding link id

    Returns
    -------
    closest_link : int
        the query probe's closest link
    """
    nodes = defaultdict(list)
    dist_pair = namedtuple('dist_pair', ['dist', 'link', 'point'])

    for position, link_id in neighbor.items():
        nodes[link_id].append(position)

    dist_infos = []
    for link, node in nodes.items():
        if len(node) > 1:
            line = LineString(node) # ?can we pre-computed this
            dist = line.project(Point(query))
            point = line.interpolate(dist)
            point = point.x, point.y
        else:
            # if the only 1 node from the link is in the
            # nearest neighborhood, then simply compute
            # the euclidean distance from that node to the link
            diff = query - np.array(node[0])
            dist = np.sqrt( np.power(diff, 2).sum() )
            # ?change this to get another node from this link, and compute the projected distance
            point = node[0]

        dist_info = dist_pair(dist, link, point)
        dist_infos.append(dist_info)
    
    closest_info = heapq.nsmallest(1, dist_infos)[0]
    closest_link = closest_info.link
    closest_point = closest_info.point
    return closest_link, closest_point


 def create_output(probe_positions, neighbors):
    """
    create the probe's projected distance and 
    the link that it belongs to
    """
    link_ids = []
    projected_points = []
    for i, pos in enumerate(probe_positions):
        link_id, projected_point = get_closest_link(pos, neighbors[i])
        link_ids.append(link_id)
        projected_points.append(projected_point)

    df_link_id = pd.DataFrame(link_ids, columns = ['link_ids'])
    df_projected_points = pd.DataFrame(projected_points, 
                                       columns = ['projected_x', 'projected_y'])
    output = pd.concat([df_link_id, df_projected_points], axis = 1)
    output.to_csv('output.csv', index = False)
    return output


 def haversine_distance(probe, link):
    """
    distance metric for measuring two points on the earth,
    not used due to its computational inefficiency
    """
    probe = np.deg2rad(probe)
    link = np.deg2rad(link)
    hav1 = 1 - np.cos(probe[0] - link[0]) / 2
    hav2 = 1 - np.cos(probe[1] - link[1]) / 2
    hav = hav1 + np.cos(probe[0]) * np.cos(probe[1]) * hav2
    return 2 * np.arcsin(np.sqrt(hav))


 if __name__ == '__main__':
    # specify k nearest neighbor's K and the file path and
    # the script will output a .csv file under the current
    # directory, the file contains the probe's projected
    # latitude, longitude and the link id that it belongs to
    k = 10 
    folder = 'probe_data_map_matching'
    probe, link = read_data(folder)
    probe_positions = probe[['latitude', 'longitude']].values
    neighbors = train_knn(link, probe_positions, k)
    output = create_output(probe_positions, neighbors)
	# python3.5
	import os
	import heapq
	import joblib
	import numpy as np
	import pandas as pd
	from shapely.geometry import LineString, Point
	from sklearn.neighbors import NearestNeighbors
	from collections import defaultdict, namedtuple

	def read_data(folder):
	"""read in probe and link data given the folder name"""
	probe_path = os.path.join(folder, 'Partition6467ProbePoints.csv')
	probe = pd.read_csv(probe_path, header = None)
	probe.columns = ['sampleID', 'dateTime', 'sourceCode', 'latitude',
	'longitude', 'altitude', 'speed', 'heading']

	link_path = os.path.join(folder, 'Partition6467LinkData.csv')
	link = pd.read_csv(link_path, header = None)
	link.columns = ['linkPVID', 'refNodeID', 'nrefNodeID', 'length',
	'functionalClass', 'directionOfTravel', 'speedCategory',
	'fromRefSpeedLimit', 'toRefSpeedLimit', 'fromRefNumLanes',
	'toRefNumLanes', 'multiDigitized', 'urban', 'timeZone',
	'shapeInfo', 'curvatureInfo', 'slopeInfo']

	def parse_shape(shape_info):
	"""extract the longitude and latitude of the shape"""
	nodes = shape_info.split('\|')
	result = np.zeros(( len(nodes), 2), dtype = np.float)
	for idx, node in enumerate(nodes):
	position = node.split('/')[:2]
	result[idx] = position

	return result

	link['nodePositions'] = link['shapeInfo'].apply(lambda row: parse_shape(row))
	return probe, link


	def train_knn(link, probe_positions, k):
	"""
	obtain a probe points nearest nodes and
	the link that those nodes belong to

	Parameters
	----------
	link : DataFrame
	link data

	probe_positions : 2d numpy array
	probe's latitude and longitude

	k : int
	K-nearest neighbors' k

	Return
	------
	neighbors : list
	each element of the list is a dictionary, the dictionary's
	key is the probe's nearest neighbor's node's position and the
	value is the link id that the node belongs to
	"""
	# create the reversed dictionary, where the
	# latitude and longitude of each node is the
	# key and the link id is the value
	node_info = {}
	link_ids = link['linkPVID']
	nodes = link['nodePositions']
	for node, link_id in zip(nodes, link_ids):
	for n in node:
	node_info[tuple(n)] = link_id

	# prepare data and train the KNN model, to get the
	# k nearest nodes for each probe point
	node_positions = np.vstack(link['nodePositions'].values)
	model = NearestNeighbors(n_neighbors = k, metric = 'euclidean')
	model.fit(node_positions)
	_, indices = model.kneighbors(probe_positions)
	closest_nodes = node_positions[indices]

	# for each probe point's nearest node, obtain the link
	# that each node belongs to, the information is stored
	# in a dictionary where the key is the node position and
	# the value is the link that it corresponds to
	neighbors = []
	for i in range(closest_nodes.shape[0]):
	neighbor = {}
	for j in closest_nodes[i]:
	neighbor[tuple(j)] = node_info[tuple(j)]

	neighbors.append(neighbor)

	return neighbors


	def get_closest_link(query, neighbor):
	"""
	given a query probe point and its
	nearest neighbors obtain the nearest link

	Parameter
	---------
	query : 1d numpy array
	the probe's longitude and latitude

	neighbor : dict
	each key is the longitude and latitude of the nearest
	neighbors and the value is the corresponding link id

	Returns
	-------
	closest_link : int
	the query probe's closest link
	"""
	nodes = defaultdict(list)
	dist_pair = namedtuple('dist_pair', ['dist', 'link', 'point'])

	for position, link_id in neighbor.items():
	nodes[link_id].append(position)

	dist_infos = []
	for link, node in nodes.items():
	if len(node) > 1:
	line = LineString(node) # ?can we pre-computed this
	dist = line.project(Point(query))
	point = line.interpolate(dist)
	point = point.x, point.y
	else:
	# if the only 1 node from the link is in the
	# nearest neighborhood, then simply compute
	# the euclidean distance from that node to the link
	diff = query - np.array(node[0])
	dist = np.sqrt( np.power(diff, 2).sum() )
	# ?change this to get another node from this link, and compute the projected distance
	point = node[0]

	dist_info = dist_pair(dist, link, point)
	dist_infos.append(dist_info)

	closest_info = heapq.nsmallest(1, dist_infos)[0]
	closest_link = closest_info.link
	closest_point = closest_info.point
	return closest_link, closest_point


	def create_output(probe_positions, neighbors):
	"""
	create the probe's projected distance and
	the link that it belongs to
	"""
	link_ids = []
	projected_points = []
	for i, pos in enumerate(probe_positions):
	link_id, projected_point = get_closest_link(pos, neighbors[i])
	link_ids.append(link_id)
	projected_points.append(projected_point)

	df_link_id = pd.DataFrame(link_ids, columns = ['link_ids'])
	df_projected_points = pd.DataFrame(projected_points,
	columns = ['projected_x', 'projected_y'])
	output = pd.concat([df_link_id, df_projected_points], axis = 1)
	output.to_csv('output.csv', index = False)
	return output


	def haversine_distance(probe, link):
	"""
	distance metric for measuring two points on the earth,
	not used due to its computational inefficiency
	"""
	probe = np.deg2rad(probe)
	link = np.deg2rad(link)
	hav1 = 1 - np.cos(probe[0] - link[0]) / 2
	hav2 = 1 - np.cos(probe[1] - link[1]) / 2
	hav = hav1 + np.cos(probe[0]) * np.cos(probe[1]) * hav2
	return 2 * np.arcsin(np.sqrt(hav))


	if __name__ == '__main__':
	# specify k nearest neighbor's K and the file path and
	# the script will output a .csv file under the current
	# directory, the file contains the probe's projected
	# latitude, longitude and the link id that it belongs to
	k = 10
	folder = 'probe_data_map_matching'
	probe, link = read_data(folder)
	probe_positions = probe[['latitude', 'longitude']].values
	neighbors = train_knn(link, probe_positions, k)
	output = create_output(probe_positions, neighbors)