Skip to content

Instantly share code, notes, and snippets.

@n-bar
Created March 20, 2017 08:39
Show Gist options
  • Save n-bar/0ac7a64d493b799db811781be023fa50 to your computer and use it in GitHub Desktop.
Save n-bar/0ac7a64d493b799db811781be023fa50 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"TestResults(failed=0, attempted=2)\n"
]
}
],
"source": [
"import math\n",
"import doctest\n",
"\n",
"R = 6371.0 # radius of the earth in km\n",
"dlat = 6371.0 / 180 * math.pi # ≈ 111.19 km\n",
"\n",
"deg_rd = 2 * math.pi / 360. # radians\n",
"\n",
"def euclid_dist(lat1, lon1, lat2, lon2):\n",
" \"\"\"\n",
" >>> euclid_dist(1, 0, -1, 0) == 2 * dlat\n",
" True\n",
" >>> euclid_dist(1, 1, 1, 1) == 0\n",
" True\n",
" \"\"\"\n",
" \n",
" coef = math.cos( (lat2 + lat1) * .5 * deg_rd) # degree -> radians; (lat2 + lat1) * .5 / 180.0\n",
" x = (lon2 - lon1) * coef\n",
" y = lat2 - lat1\n",
" d = (x*x + y*y)**.5 * dlat\n",
" \n",
" return d\n",
"\n",
"if __name__ == \"__main__\":\n",
" print(doctest.testmod())"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(50.194, -85.499, -80.99, -49.822)\n"
]
}
],
"source": [
"from random import randint, seed\n",
"\n",
"\n",
"seed(42)\n",
"\n",
"rnd_lat = lambda lat=180: randint(-lat*1e3, lat*1e3) / 1e3\n",
"rnd_lon = lambda lon=90: randint(-lon*1e3, lon*1e3) / 1e3\n",
"\n",
"p = rnd_lat(), rnd_lon(), rnd_lat(), rnd_lon()\n",
"print(p)\n",
"lat1, lon1, lat2, lon2 = p"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"%load_ext line_profiler"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1000000 loops, best of 3: 1.25 µs per loop\n"
]
}
],
"source": [
"%timeit -n 1000000 -r 3 euclid_dist(lat1, lon1, lat2, lon2)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"%lprun -f euclid_dist euclid_dist(lat1, lon1, lat2, lon2)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Python"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"def euclid_dist(lat1, lon1, lat2, lon2): \n",
" x = (lon2 - lon1) * math.cos((lat1 + lat2) * .5 * deg_rd)\n",
" y = lat2 - lat1\n",
" return (x*x + y*y)**.5 * dlat"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1000000 loops, best of 3: 1.16 µs per loop\n"
]
}
],
"source": [
"%timeit -n 1000000 -r 3 euclid_dist(lat1, lon1, lat2, lon2)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"%lprun -f euclid_dist euclid_dist(lat1, lon1, lat2, lon2)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Numba"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"import numba\n",
"\n",
"\n",
"@numba.jit([\"float64(float64, float64, float64, float64)\"], nopython=True, target='cpu', cache=True)\n",
"def nu_euclid_dist(lat1, lon1, lat2, lon2): \n",
" x = (lon2 - lon1) * math.cos((lat1 + lat2) * .5 * deg_rd)\n",
" y = lat2 - lat1\n",
" return (x*x + y*y)**.5 * dlat"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1000000 loops, best of 3: 456 ns per loop\n"
]
}
],
"source": [
"%timeit -n 1000000 -r 3 nu_euclid_dist(lat1, lon1, lat2, lon2)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Cython"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"%load_ext cython"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"%%cython\n",
"cimport cython\n",
"from libc.math cimport cos, pi, sqrt\n",
"\n",
"\n",
"cdef double R = 6371.0 # radius of the earth in km\n",
"cdef double dlat = R / 180.0 * pi # ≈ 111.19 km\n",
"cdef double deg_rd = 2 * pi / 360.\n",
"\n",
"def cy_euclid_dist(double lat1, double lon1, double lat2, double lon2):\n",
" cdef double lat_avg_rad = (lat2 + lat1) *.5 * deg_rd\n",
" cdef double coeff = cos(lat_avg_rad)\n",
" cdef double x = (lon2 - lon1) * coeff\n",
" cdef double y = lat2 - lat1\n",
"\n",
" return sqrt(x*x + y*y) * dlat"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1000000 loops, best of 3: 329 ns per loop\n"
]
}
],
"source": [
"%timeit -n 1000000 -r 3 cy_euclid_dist(lat1, lon1, lat2, lon2)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Collections"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"[(-0.923, 0.393), (-0.712, -0.075)]"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"seed(43)\n",
"target_points = [(rnd_lat(1), rnd_lon(1)) for _ in range(10)]\n",
"target_points[:2]"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"[(-0.024, -0.323), (-0.837, -0.856)]"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"seed(45)\n",
"rows = int(1e5)\n",
"checked_points = [[(rnd_lat(1), rnd_lon(1)) for _ in range(randint(1, 5))] for _ in range(rows)]\n",
"checked_points = [zip(*p) for p in checked_points]\n",
"checked_points[0]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Задача - подсчитать количество строк где есть хоть одна координата на допустимом растоянии от проверямой коллекции точек."
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1.62%\n"
]
}
],
"source": [
"class HasGoodPoint:\n",
" def __init__(self, lats, lons):\n",
" assert len(lats) == len(lons), u\"Длины списков должны совпадать\"\n",
" self.lats = lats\n",
" self.lons = lons\n",
" self.lat_max = max(self.lats) + .5\n",
" self.lon_max = max(self.lons) + .5\n",
" self.lat_min = min(self.lats) - .5\n",
" self.lon_min = min(self.lons) - .5\n",
" self.idx_1 = range(len(self.lats))\n",
" \n",
" def __call__(self, lats, lons, max_dist_km):\n",
" assert 0 < max_dist_km <= 10, u\"max_dist_km вне допустимого диапазона: (0, 10]\"\n",
" assert len(lats) == len(lons), u\"Длины списков должны совпадать\"\n",
" idx_2 = len(lats)\n",
" for i in range(idx_2):\n",
" lat_2, lon_2 = lats[i], lons[i]\n",
" if not self.lat_min <= lat_2 <= self.lat_max:\n",
" continue\n",
" if not self.lon_min <= lon_2 <= self.lon_max:\n",
" continue\n",
" for x in self.idx_1:\n",
" if euclid_dist(self.lats[x], self.lons[x], lat_2, lon_2) <= max_dist_km:\n",
" return True\n",
" return False\n",
"\n",
"lats, lons = zip(*target_points)\n",
"has_point = HasGoodPoint(lats, lons)\n",
"print(\"%.2f%%\" % (sum(has_point(*p, max_dist_km=3) for p in checked_points) / float(rows) * 100))"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"2 loops, best of 3: 4.83 s per loop\n",
"rows/sec: 20047\n"
]
}
],
"source": [
"t = %timeit -on 2 -r 3 sum(has_point(*p, max_dist_km=3) for p in checked_points)\n",
"print(\"rows/sec: %.f\" % ((t.repeat * t.loops * rows) / sum(t.all_runs)))"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1.62%\n"
]
}
],
"source": [
"def HasGoodPoint(lats, lons):\n",
" assert len(lats) == len(lons), u\"Длины списков должны совпадать\"\n",
" lat_max = max(lats) + .5\n",
" lon_max = max(lons) + .5\n",
" lat_min = min(lats) - .5\n",
" lon_min = min(lons) - .5\n",
" idx_1 = range(len(lats))\n",
" \n",
" def has(lats_2, lons_2, max_dist_km):\n",
"\n",
" assert 0 < max_dist_km <= 10, u\"max_dist_km вне допустимого диапазона: (0, 10]\"\n",
" assert len(lats) == len(lons), u\"Длины списков должны совпадать\"\n",
" idx_2 = len(lats_2)\n",
"\n",
" for i in range(idx_2):\n",
" lat_2, lon_2 = lats_2[i], lons_2[i]\n",
" if not lat_min <= lat_2 <= lat_max:\n",
" continue\n",
" if not lon_min <= lon_2 <= lon_max:\n",
" continue\n",
" for x in idx_1:\n",
" if euclid_dist(lats[x], lons[x], lat_2, lon_2) <= max_dist_km:\n",
" return True\n",
" return False\n",
" \n",
" return has\n",
"\n",
"has_point = HasGoodPoint(lats, lons)\n",
"print(\"%.2f%%\" % (sum(has_point(*p, max_dist_km=3) for p in checked_points) / float(rows) * 100))"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"2 loops, best of 3: -5.83e+09 ns per loop\n",
"rows/sec: 80413\n"
]
}
],
"source": [
"t = %timeit -on 2 -r 3 sum(has_point(*p, max_dist_km=3) for p in checked_points)\n",
"print(\"rows/sec: %.f\" % ((t.repeat * t.loops * rows) / sum(t.all_runs)))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Numba"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1.62%\n"
]
}
],
"source": [
"def HasGoodPoint(lats, lons):\n",
" assert len(lats) == len(lons), u\"Длины списков должны совпадать\"\n",
" lat_max = max(lats) + .5\n",
" lon_max = max(lons) + .5\n",
" lat_min = min(lats) - .5\n",
" lon_min = min(lons) - .5\n",
" idx_1 = range(len(lats))\n",
" \n",
" def has(lats_2, lons_2, max_dist_km):\n",
"\n",
" assert 0 < max_dist_km <= 10, u\"max_dist_km вне допустимого диапазона: (0, 10]\"\n",
" assert len(lats) == len(lons), u\"Длины списков должны совпадать\"\n",
" idx_2 = len(lats_2)\n",
" for i in range(idx_2):\n",
" lat_2, lon_2 = lats_2[i], lons_2[i]\n",
" if not lat_min <= lat_2 <= lat_max:\n",
" continue\n",
" if not lon_min <= lon_2 <= lon_max:\n",
" continue\n",
" for x in idx_1:\n",
" if nu_euclid_dist(lats[x], lons[x], lat_2, lon_2) <= max_dist_km:\n",
" return True\n",
" return False\n",
" \n",
" return has\n",
"\n",
"has_point = HasGoodPoint(lats, lons)\n",
"print(\"%.2f%%\" % (sum(has_point(*p, max_dist_km=3) for p in checked_points) / float(rows) * 100))"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"2 loops, best of 3: 2.13 s per loop\n",
"rows/sec: 46377\n"
]
}
],
"source": [
"t = %timeit -on 2 -r 3 sum(has_point(*p, max_dist_km=3) for p in checked_points)\n",
"print(\"rows/sec: %.f\" % ((t.repeat * t.loops * rows) / sum(t.all_runs)))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Cython 1"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1.62%\n"
]
}
],
"source": [
"def HasGoodPoint(lats, lons):\n",
" assert len(lats) == len(lons), u\"Длины списков должны совпадать\"\n",
" lat_max = max(lats) + .5\n",
" lon_max = max(lons) + .5\n",
" lat_min = min(lats) - .5\n",
" lon_min = min(lons) - .5\n",
" idx = range(len(lats))\n",
" \n",
" def has(lats_2, lons_2, max_dist_km):\n",
"\n",
" assert 0 < max_dist_km <= 10, u\"max_dist_km вне допустимого диапазона: (0, 10]\"\n",
" assert len(lats) == len(lons), u\"Длины списков должны совпадать\"\n",
" n = len(lats_2)\n",
" for i in range(n):\n",
" lat_2, lon_2 = lats_2[i], lons_2[i]\n",
" if not lat_min <= lat_2 <= lat_max:\n",
" continue\n",
" if not lon_min <= lon_2 <= lon_max:\n",
" continue\n",
" for x in idx:\n",
" if cy_euclid_dist(lats[x], lons[x], lat_2, lon_2) <= max_dist_km:\n",
" return True\n",
" return False\n",
" \n",
" return has\n",
"\n",
"has_point = HasGoodPoint(lats, lons)\n",
"print(\"%.2f%%\" % (sum(has_point(*p, max_dist_km=3) for p in checked_points) / float(rows) * 100))"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"2 loops, best of 3: 1.79 s per loop\n",
"rows/sec: 55190\n"
]
}
],
"source": [
"t = %timeit -on 2 -r 3 sum(has_point(*p, max_dist_km=3) for p in checked_points)\n",
"print(\"rows/sec: %.f\" % ((t.repeat * t.loops * rows) / sum(t.all_runs)))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Cython 2"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"%%cython\n",
"from libc.math cimport cos, pi, sqrt\n",
"from cpython cimport bool\n",
"\n",
"\n",
"cdef double R = 6371.0 # radius of the earth in km\n",
"cdef double dlat = R / 180.0 * pi # ≈ 111.19 km\n",
"cdef double deg_rd = 2 * pi / 360.\n",
"\n",
"\n",
"cdef double cyx_euclid_dist(double lat_1, double lon_1, double lat_2, double lon_2):\n",
" cdef double lat_avg_rad = (lat_2 + lat_1) *.5 * deg_rd\n",
" cdef double coeff = cos(lat_avg_rad)\n",
" cdef double x = (lon_2 - lon_1) * coeff\n",
" cdef double y = lat_2 - lat_1\n",
"\n",
" return sqrt(x*x + y*y) * dlat\n",
"\n",
"\n",
"cdef bool cyx_is_near(double lat_1, double lon_1, double lat_2, double lon_2, float max_dist_km):\n",
" return cyx_euclid_dist(lat_1, lon_1, lat_2, lon_2) <= max_dist_km\n",
" \n",
"\n",
"def HasGoodPoint(tuple lats_1, tuple lons_1):\n",
" cdef double lat_max, lon_max, lat_min, lon_min\n",
" lat_max = max(lats_1) + .5\n",
" lon_max = max(lons_1) + .5\n",
" lat_min = min(lats_1) - .5\n",
" lon_min = min(lons_1) - .5\n",
" \n",
" def has(tuple lats_2, tuple lons_2, double max_dist_km):\n",
" cdef double lat_2, lon_2\n",
" for i in xrange(len(lats_2)):\n",
" lat_2, lon_2 = lats_2[i], lons_2[i]\n",
" if not lat_min <= lat_2 <= lat_max:\n",
" continue\n",
" if not lon_min <= lon_2 <= lon_max:\n",
" continue\n",
" for x in xrange(len(lats_1)):\n",
" if cyx_is_near(lats_1[x], lons_1[x], lat_2, lon_2, max_dist_km):\n",
" return True\n",
" return False\n",
" return has"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1.62%\n"
]
}
],
"source": [
"has_point = HasGoodPoint(lats, lons)\n",
"print(\"%.2f%%\" % (sum(has_point(*p, max_dist_km=3) for p in checked_points) / float(rows) * 100))"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"2 loops, best of 3: 114 ms per loop\n",
"rows/sec: 874448\n"
]
}
],
"source": [
"t = %timeit -on 2 -r 3 sum(has_point(*p, max_dist_km=3) for p in checked_points)\n",
"print(\"rows/sec: %.f\" % ((t.repeat * t.loops * rows) / sum(t.all_runs)))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Cython 3"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"%%cython\n",
"from libc.math cimport cos, pi, sqrt\n",
"from cpython cimport bool\n",
"\n",
"\n",
"cdef double R = 6371.0 # radius of the earth in km\n",
"cdef double dlat = R / 180.0 * pi # ≈ 111.19 km\n",
"cdef double deg_rd = 2 * pi / 360.\n",
"\n",
"\n",
"cdef double cyx_euclid_dist(double lat_1, double lon_1, double lat_2, double lon_2):\n",
" cdef double lat_avg_rad = (lat_2 + lat_1) *.5 * deg_rd\n",
" cdef double coeff = cos(lat_avg_rad)\n",
" cdef double x = (lon_2 - lon_1) * coeff\n",
" cdef double y = lat_2 - lat_1\n",
"\n",
" return sqrt(x*x + y*y) * dlat\n",
"\n",
"\n",
"cdef bool cyx_is_near(double lat_1, double lon_1, double lat_2, double lon_2, float max_dist_km):\n",
" return cyx_euclid_dist(lat_1, lon_1, lat_2, lon_2) <= max_dist_km\n",
"\n",
" \n",
"cdef class cyx_HasGoodPoint:\n",
" cdef tuple lats_1, lons_1\n",
" cdef double lat_max, lon_max, lat_min, lon_min\n",
" cdef int idx_1\n",
" \n",
" def __init__(self, lats, lons):\n",
" \n",
" self.lats_1 = lats\n",
" self.lons_1 = lons\n",
" self.lat_max = max(self.lats_1) + .5\n",
" self.lon_max = max(self.lons_1) + .5\n",
" self.lat_min = min(self.lats_1) - .5\n",
" self.lon_min = min(self.lons_1) - .5\n",
" self.idx_1 = len(self.lons_1)\n",
"\n",
" \n",
" def __call__(self, tuple lats_2, tuple lons_2, double max_dist_km):\n",
" \n",
" cdef double lat_2, lon_2, lat_2_max, lon_2_max, lat_2_min, lon_2_min \n",
" \n",
" for i in xrange(len(lats_2)):\n",
" lat_2, lon_2 = lats_2[i], lons_2[i]\n",
" if not self.lat_min <= lat_2 <= self.lat_max:\n",
" continue\n",
" if not self.lon_min <= lon_2 <= self.lon_max:\n",
" continue\n",
" for x in xrange(self.idx_1):\n",
" if cyx_is_near(self.lats_1[x], self.lons_1[x], lat_2, lon_2, max_dist_km):\n",
" return True\n",
" return False"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1.62%\n"
]
}
],
"source": [
"has_point = cyx_HasGoodPoint(lats, lons)\n",
"print(\"%.2f%%\" % (sum(has_point(*p, max_dist_km=3) for p in checked_points) / float(rows) * 100))"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"2 loops, best of 3: 115 ms per loop\n",
"rows/sec: 856432\n"
]
}
],
"source": [
"t = %timeit -on 2 -r 3 sum(has_point(*p, max_dist_km=3) for p in checked_points)\n",
"print(\"rows/sec: %.f\" % ((t.repeat * t.loops * rows) / sum(t.all_runs)))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"anaconda-cloud": {},
"kernelspec": {
"display_name": "Python [default]",
"language": "python",
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.12"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment