Last active
November 23, 2024 00:10
-
-
Save jenningsanderson/7561fa5853d0f88d84894a06a6ee96f2 to your computer and use it in GitHub Desktop.
Foursquare DuckDB + h3 + GeoParquet
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
-- First just download it all. | |
LOAD spatial; | |
LOAD h3; | |
COPY( | |
SELECT | |
* EXCLUDE(latitude, longitude), | |
h3_latlng_to_cell(latitude, longitude, 8) AS h3, | |
ST_Point(longitude, latitude) as geometry, | |
CAST( | |
ROW(longitude, latitude, longitude, latitude) AS | |
ROW(xmin DOUBLE, ymin DOUBLE, xmax DOUBLE, ymax DOUBLE) | |
) AS bbox | |
FROM | |
read_parquet('s3://fsq-os-places-us-east-1/release/dt=2024-11-19/places/parquet/*', hive_partitioning=1) | |
) TO 'foursquare_no_order.parquet' | |
WITH (FORMAT PARQUET, COMPRESSION 'ZSTD'); | |
-- And the categories | |
COPY( | |
SELECT | |
* | |
FROM | |
read_parquet('s3://fsq-os-places-us-east-1/release/dt=2024-11-19/categories/parquet/*', hive_partitioning=1) | |
) TO 'foursquare_categories.parquet'; | |
-- Now let's restructure it a bit: | |
COPY( | |
SELECT | |
name, | |
category_name AS category, | |
CAST( ROW( | |
address, | |
locality, | |
region, | |
postcode, | |
admin_region, | |
post_town, | |
po_box, | |
country | |
) AS ROW( | |
street VARCHAR, | |
locality VARCHAR, | |
region VARCHAR, | |
postcode VARCHAR, | |
admin_region VARCHAR, | |
post_town VARCHAR, | |
po_box VARCHAR, | |
country VARCHAR | |
) | |
) AS address, | |
CAST( ROW( | |
tel, | |
website, | |
email, | |
facebook_id, | |
instagram, | |
) AS ROW( | |
tel VARCHAR, | |
website VARCHAR, | |
email VARCHAR, | |
facebook VARCHAR, | |
instagram VARCHAR, | |
twitter VARCHAR | |
) | |
) AS contact, | |
CAST( ROW( | |
a.fsq_place_id, | |
fsq_category_ids, | |
fsq_category_labels, | |
date_created, | |
date_refreshed, | |
date_closed | |
) AS ROW( | |
place_id VARCHAR, | |
category_ids VARCHAR[], | |
category_labels VARCHAR[], | |
date_created DATE, | |
date_refreshed DATE, | |
date_closed DATE | |
) | |
) AS fsq, | |
h3_h3_to_string(h3) as h3, | |
geometry, | |
bbox | |
FROM | |
'foursquare_no_order.parquet' a | |
LEFT JOIN | |
( | |
SELECT | |
fsq_place_id, | |
category_name, | |
FROM 'fsq.parquet' | |
JOIN ( | |
SELECT | |
category_name, | |
category_id | |
FROM 'foursquare_categories.parquet' | |
) ON category_id = fsq_category_ids[1] | |
WHERE fsq_category_ids IS NOT NULL | |
) b ON a.fsq_place_id = b.fsq_place_id | |
-- ORDER BY ST_Hilbert(geometry) This doesn't work as expected | |
ORDER BY h3 | |
) TO 'fsq_places_h3.parquet' (FORMAT PARQUET, COMPRESSION 'ZSTD'); |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Updated to order by hilbert curve