Skip to content

Instantly share code, notes, and snippets.

@jenningsanderson
Last active November 23, 2024 00:10
Show Gist options
  • Save jenningsanderson/7561fa5853d0f88d84894a06a6ee96f2 to your computer and use it in GitHub Desktop.
Save jenningsanderson/7561fa5853d0f88d84894a06a6ee96f2 to your computer and use it in GitHub Desktop.
Foursquare DuckDB + h3 + GeoParquet
-- First just download it all.
LOAD spatial;
LOAD h3;
COPY(
SELECT
* EXCLUDE(latitude, longitude),
h3_latlng_to_cell(latitude, longitude, 8) AS h3,
ST_Point(longitude, latitude) as geometry,
CAST(
ROW(longitude, latitude, longitude, latitude) AS
ROW(xmin DOUBLE, ymin DOUBLE, xmax DOUBLE, ymax DOUBLE)
) AS bbox
FROM
read_parquet('s3://fsq-os-places-us-east-1/release/dt=2024-11-19/places/parquet/*', hive_partitioning=1)
) TO 'foursquare_no_order.parquet'
WITH (FORMAT PARQUET, COMPRESSION 'ZSTD');
-- And the categories
COPY(
SELECT
*
FROM
read_parquet('s3://fsq-os-places-us-east-1/release/dt=2024-11-19/categories/parquet/*', hive_partitioning=1)
) TO 'foursquare_categories.parquet';
-- Now let's restructure it a bit:
COPY(
SELECT
name,
category_name AS category,
CAST( ROW(
address,
locality,
region,
postcode,
admin_region,
post_town,
po_box,
country
) AS ROW(
street VARCHAR,
locality VARCHAR,
region VARCHAR,
postcode VARCHAR,
admin_region VARCHAR,
post_town VARCHAR,
po_box VARCHAR,
country VARCHAR
)
) AS address,
CAST( ROW(
tel,
website,
email,
facebook_id,
instagram,
twitter
) AS ROW(
tel VARCHAR,
website VARCHAR,
email VARCHAR,
facebook VARCHAR,
instagram VARCHAR,
twitter VARCHAR
)
) AS contact,
CAST( ROW(
a.fsq_place_id,
fsq_category_ids,
fsq_category_labels,
date_created,
date_refreshed,
date_closed
) AS ROW(
place_id VARCHAR,
category_ids VARCHAR[],
category_labels VARCHAR[],
date_created DATE,
date_refreshed DATE,
date_closed DATE
)
) AS fsq,
h3_h3_to_string(h3) as h3,
geometry,
bbox
FROM
'foursquare_no_order.parquet' a
LEFT JOIN
(
SELECT
fsq_place_id,
category_name,
FROM 'fsq.parquet'
JOIN (
SELECT
category_name,
category_id
FROM 'foursquare_categories.parquet'
) ON category_id = fsq_category_ids[1]
WHERE fsq_category_ids IS NOT NULL
) b ON a.fsq_place_id = b.fsq_place_id
-- ORDER BY ST_Hilbert(geometry) This doesn't work as expected
ORDER BY h3
) TO 'fsq_places_h3.parquet' (FORMAT PARQUET, COMPRESSION 'ZSTD');
@jenningsanderson
Copy link
Author

Updated to order by hilbert curve

@jenningsanderson
Copy link
Author

Row group bounds... not good, but better than nothing.
image

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment