Skip to content

Instantly share code, notes, and snippets.

@eknowles
Created November 9, 2024 13:06
Show Gist options
  • Save eknowles/046742fa7877754a0605a90ad830b105 to your computer and use it in GitHub Desktop.
Save eknowles/046742fa7877754a0605a90ad830b105 to your computer and use it in GitHub Desktop.
overture buildings duckdb slow
# pmtiles extract https://overturemaps-tiles-us-west-2-beta.s3.amazonaws.com/2024-08-20/buildings.pmtiles data/overture_buildings.pmtiles --region=data/e00_geometry.geojson
# ogr2ogr data/overture_buildings.geojson data/overture_buildings.pmtiles -t_srs EPSG:4326 -skipfailures -nln buildings -nlt POLYGON -select @geometry_source,class,height -clipsrc data/e00_geometry.geojson
@asset(
deps=["e00_geometry"]
)
def overture_buildings(duckdb: DuckDBResource) -> Output[None]:
with duckdb.get_connection() as conn:
conn.load_extension("spatial")
bbox = conn.sql(f"""
SELECT
MIN(ST_XMin(geom)) AS min_x,
MAX(ST_XMax(geom)) AS max_x,
MIN(ST_YMin(geom)) AS min_y,
MAX(ST_YMax(geom)) AS max_y
FROM st_read('data/areas.geojson');
""").fetchall()
min_x, max_x, min_y, max_y = bbox[0]
query = f"""\
SET azure_storage_connection_string = 'DefaultEndpointsProtocol=https;AccountName=overturemapswestus2;AccountKey=;EndpointSuffix=core.windows.net';
CREATE TEMP TABLE tmp_areas AS SELECT geom FROM st_read('data/areas.geojson');
COPY(
SELECT b.geometry
FROM read_parquet('azure://release/2024-10-23.0/theme=buildings/type=building/*',
filename=true,
hive_partitioning=1) b
JOIN tmp_areas a
ON ST_Within(b.geometry, a.geom)
WHERE b.bbox.xmin BETWEEN {round(min_x, 2)} AND {round(max_x, 2)}
AND b.bbox.ymin BETWEEN {round(min_y, 2)} AND {round(max_y, 2)}
) TO 'data/overture_buildings.parquet' (FORMAT PARQUET);
DROP TABLE tmp_areas;
"""
conn.execute(query)
conn.execute("""
COPY (
SELECT geometry
FROM 'data/overture_buildings.parquet'
) TO 'data/overture_buildings.geojson' WITH (FORMAT GDAL, DRIVER 'GeoJSON')
""")
count = conn.sql("select count(*) from 'data/overture_buildings.parquet'").fetchall()[0][0]
return Output(
None,
metadata={
'file': MetadataValue.text("data/overture_buildings.geojson"),
'total_rows': IntMetadataValue(count)
}
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment