Last active
February 7, 2024 04:58
-
-
Save answerquest/3cfc412cdda76efd02bda5574c252459 to your computer and use it in GitHub Desktop.
Extract data from a vector tile layer for a given area
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
Program by Nikhil VJ, https://nikhilvj.co.in | |
Based on solution worked out at https://gis.stackexchange.com/questions/475398/extract-features-in-lat-long-co-ordinates-from-vector-tile-layer-pbf-file-pytho | |
License: Creative Commons Zero v1.0 Universal, ref: https://choosealicense.com/licenses/cc0-1.0/ | |
What this does: | |
- Given any vector tile layer in .pbf format, and a poylgon or similar shape, | |
- Extracts all the data from the vector tile layer falling over that shape, | |
- And save it to local disk as a .gpkg shapefile which you can further use | |
- If the vector tile layer had multiple layers, then .gpkg will have all those layers | |
Advantages: | |
- You can fetch data from sources where just vector tile layer is available, original data is not. | |
- Can take just what you need and avoid having to download and process the huge source data | |
- This becomes static local data, so you can work with this offline, edit it, do operations on it etc | |
Install reqd libs : | |
pip install mapbox-vector-tile mercantile geopandas shapely requests | |
Trivia: | |
The part where shape is converted to tile ranges and each tile is downloaded, was got from this chatgpt prompt, | |
but I had to flip the Y part to make it work: | |
"given a vector tile url like `https://indianopenmaps.fly.dev/google-buildings/{z}/{x}/{y}.pbf` and a polygon, | |
give python code to fetch all vector data from all tiles that are within or intersecting with this polygon, | |
at a specified zoom level." | |
And I had to search further online for how to convert the downloaded binary tile data to vector; | |
chatgpt went down full protocol buffer type rabbit holes for it which didn't work. | |
Disclaimer / Warning: | |
This program is made for SMALL REGIONS use cases, | |
where you just want data for a small target area like a city or district or so. | |
Do NOT try to download entire countries or continents using this; | |
your computer will probably hang up from all the data being collected, | |
and your IP might get blacklisted by the tile layer's publisher too. | |
If you MUST do this with large number of tiles, then ensure you keep a pause between batches, | |
and run the program overnight so that you don't spam the publisher. You have been warned. | |
Warning for Teachers: | |
Do NOT give this as an assignment to all your students to do at the same time; | |
instead, modify the program to download the tiles and SAVE them to a local folder, then share that folder with everyone. | |
Please add comment below if you need help. | |
""" | |
# fill all vital info here | |
tile_url = "https://indianopenmaps.fly.dev/google-buildings/{z}/{x}/{y}.pbf" # google buildings data, from https://github.com/ramSeraph/google_buildings_india | |
pfile = "shape1.geojson" | |
zoom_level = 14 # vector tiles, they typically don't go more than this. Use lower if you want lower-res data | |
output_filename = "output" # don't put extension, we're sticking to .gpkg only | |
MAX_ATTEMPTS = 3 | |
TILES_LIMIT=0 # set to a positive integer to set global upper limit on number of tiles to fetch, even if the shape needs more tiles | |
################## | |
import json | |
import time | |
import math | |
import requests | |
from shapely.geometry import shape | |
import mercantile | |
import mapbox_vector_tile | |
import geopandas as gpd | |
##################### | |
# FUNCTIONS | |
def pixel2deg(xtile, ytile, zoom, xpixel, ypixel, extent = 4096): | |
# from https://gis.stackexchange.com/a/460173/44746 | |
n = 2.0 ** zoom | |
xtile = xtile + (xpixel / extent) | |
ytile = ytile + ((extent - ypixel) / extent) | |
lon_deg = (xtile / n) * 360.0 - 180.0 | |
lat_rad = math.atan(math.sinh(math.pi * (1 - 2 * ytile / n))) | |
lat_deg = math.degrees(lat_rad) | |
return (lon_deg, lat_deg) | |
def fetch_file(url): | |
global MAX_ATTEMPTS | |
attempts = 0 | |
while True: | |
response = requests.get(url) | |
attempts += 1 | |
if response.status_code == 200: | |
return response.content | |
else: | |
if attempts > MAX_ATTEMPTS: | |
print(f"Max attempts to fetch {url} crossed, giving up.") | |
return False | |
print(f"Failed to fetch {url}, status: {response.status_code} trying after 10secs cooloff") | |
time.sleep(10) | |
##################### | |
# MAIN PROGRAM START | |
with open(pfile ,'r') as f: | |
poly1 = json.loads(f.read()) | |
polygon = poly1['features'][0]['geometry'] | |
shapely_polygon = shape(polygon) | |
bounding_box = shapely_polygon.bounds | |
print(f"{bounding_box=}") | |
# Calculate the tiles covering the bounding box at the given zoom level | |
SWTile = mercantile.tile(bounding_box[0], bounding_box[1], zoom_level) | |
NETile = mercantile.tile(bounding_box[2], bounding_box[3], zoom_level) | |
print(f"{SWTile=} {NETile=}") | |
# get the tile ranges, x and y | |
min_x = SWTile.x | |
max_x = NETile.x | |
min_y = NETile.y | |
max_y = SWTile.y | |
# Note: FLIPPING for y, as in tiles system y goes from north to south, wherease latitude goes from south to north | |
total_tiles = (max_x - min_x + 1) * (max_y - min_y + 1) | |
print(f"X ranging from {min_x}->{max_x}, Y ranging from {min_y}->{max_y}, total tiles: {total_tiles}") | |
# Iterate through each tile and fetch its vector data | |
feature_collector = {} | |
tilecounter = 0 | |
breakFlag = False | |
for tile_x in range(min_x, max_x + 1): | |
for tile_y in range(min_y, max_y + 1): | |
if TILES_LIMIT: | |
if tilecounter >= TILES_LIMIT: | |
breakFlag = True | |
break | |
tile_url_with_coords = tile_url.format(z=zoom_level, x=tile_x, y=tile_y) | |
pbf_data = fetch_file(tile_url_with_coords) | |
if not pbf_data: | |
print(f"Failed to fetch {tile_url_with_coords}, skipping") | |
continue | |
else: | |
tilecounter += 1 | |
if total_tiles >= 100 and tilecounter % 10 == 0: | |
time.sleep(2) | |
# give the server a breather! | |
vector_data = mapbox_vector_tile.decode(tile=pbf_data, | |
transformer = lambda x, y: pixel2deg(tile_x, tile_y, zoom_level, x, y) | |
) | |
# if you're unsure which all layers are avaialable in the tile, uncomment the line below and run once | |
# print(list(vector_data.keys()) | |
this_tile_count = 0 | |
layer_count = 0 | |
for layer_name in vector_data.keys(): | |
features = vector_data.get(layer_name,{}).get('features',[]) | |
if(len(features)): | |
this_tile_count += len(features) | |
layer_count += 1 | |
if not feature_collector.get(layer_name,False): | |
feature_collector[layer_name] = features | |
else: | |
feature_collector[layer_name] += features | |
print(f"{tile_url_with_coords}: {round(len(pbf_data)/(2**10),2)} KB, {this_tile_count} features from {layer_count} layers") | |
if breakFlag: | |
break | |
print(f"Total {tilecounter} tiles processed, total layers: {len(feature_collector.keys())}") | |
# Save to local file | |
for lN, layer in enumerate(list(feature_collector.keys())): | |
print(f"Layer {layer} has {len(feature_collector[layer])} features") | |
gdf = gpd.GeoDataFrame.from_features(feature_collector[layer]) | |
if lN == 0: | |
gdf.to_file(f"{output_filename}.gpkg", layer=layer, driver='GPKG') | |
else: | |
gdf.to_file(f"{output_filename}.gpkg", layer=layer, driver='GPKG', append=True) | |
# geopandas' to_file command, supports a range of formats like .gpkg, .geojson etc. | |
# Ref: https://geopandas.org/en/stable/docs/reference/api/geopandas.GeoDataFrame.to_file.html | |
print("Ok we're done") |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"type": "FeatureCollection", | |
"crs": { "type": "name", "properties": { "name": "urn:ogc:def:crs:OGC:1.3:CRS84" } }, | |
"features": [ | |
{ "type": "Feature", "properties": { }, | |
"geometry": { "type": "MultiPolygon", "coordinates": [ [ [ [ 77.923011315970342, 30.344647089972636 ], [ 77.931489883119454, 30.323216324155737 ], [ 77.931035674165045, 30.312891293764995 ], [ 77.944359136827927, 30.305571625932046 ], [ 77.943904927873518, 30.300735116946182 ], [ 77.95798540546042, 30.294395007336927 ], [ 77.959348032323661, 30.289950150014867 ], [ 78.009311017309457, 30.259222835620964 ], [ 78.033838300847933, 30.244411450884343 ], [ 78.043225285905891, 30.258536264499419 ], [ 78.058819793340845, 30.267559387655684 ], [ 78.061545047067327, 30.263374853926937 ], [ 78.069720808246842, 30.261805607801818 ], [ 78.080394718675592, 30.268736255633883 ], [ 78.097806061928239, 30.258405488503509 ], [ 78.092809763429642, 30.267167095194619 ], [ 78.108101464895, 30.28181161720158 ], [ 78.107950061910216, 30.293970139976565 ], [ 78.119002479801011, 30.296780766788185 ], [ 78.123695972329969, 30.30004883620677 ], [ 78.11430898727204, 30.303447512855989 ], [ 78.107344449970981, 30.315995452092924 ], [ 78.105679017138129, 30.342915815085462 ], [ 78.106890241016572, 30.364178974902018 ], [ 78.106890241016572, 30.364178974902018 ], [ 78.110069703697505, 30.373584243855866 ], [ 78.102348151472384, 30.388735274034641 ], [ 78.098714479837071, 30.401272281127877 ], [ 78.046026241124764, 30.402708710432179 ], [ 78.049811315744932, 30.391869676661067 ], [ 78.025435435191241, 30.368881722463353 ], [ 78.011090002380925, 30.363885045670393 ], [ 77.957796151729411, 30.358136918708745 ], [ 77.954162480094084, 30.339322505324549 ], [ 77.940384808476793, 30.34023717850539 ], [ 77.923124868208959, 30.344679755242197 ], [ 77.923011315970342, 30.344647089972636 ] ] ] ] } } | |
] | |
} |
2024-02-07 : Updated to grab all layers present in the tile, and you don't need to know the layer name in advance.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Great