Skip to content

Instantly share code, notes, and snippets.

@timrobertson100
Last active January 21, 2025 08:50
Show Gist options
  • Save timrobertson100/cbcb6c681fe1cecd2ca057204b8f292a to your computer and use it in GitHub Desktop.
Save timrobertson100/cbcb6c681fe1cecd2ca057204b8f292a to your computer and use it in GitHub Desktop.
Data preparation from GBIF to clickhouse with H3 indexes
# create the parquet files for the data
CREATE TABLE tim.niels STORED AS parquet AS
SELECT
decimalLatitude,
decimalLongitude,
datasetKey,
kingdomKey,
phylumKey,
classKey,
orderKey,
familyKey,
genusKey,
speciesKey,
taxonKey,
year,
count(*) AS occCount
FROM prod_h.occurrence
GROUP BY
decimalLatitude,
decimalLongitude,
datasetKey,
kingdomKey,
phylumKey,
classKey,
orderKey,
familyKey,
genusKey,
speciesKey,
taxonKey,
year;
# Mount as an external table in clickhouse
CREATE TABLE hdfs_niels_occurrence
(
decimallatitude Float64,
decimallongitude Float64,
datasetkey UUID,
kingdomkey UInt8,
phylumkey UInt32,
classkey UInt32,
orderkey UInt32,
familykey UInt32,
genuskey UInt32,
specieskey UInt32,
taxonkey UInt32,
year UInt16,
occcount UInt64
) ENGINE = HDFS('hdfs://ha-nn/user/hive/warehouse/tim.db/niels/*', 'parquet');
# create the table with h3 indexes
SET allow_suspicious_low_cardinality_types=1;
CREATE TABLE occurrence_h3
(
mercator_x UInt32 MATERIALIZED 0xFFFFFFFF * ((decimallongitude + 180) / 360),
mercator_y UInt32 MATERIALIZED 0xFFFFFFFF * (1/2 - log(tan((decimallatitude + 90) / 360 * pi())) / 2 / pi()),
INDEX idx_x (mercator_x) TYPE minmax,
INDEX idx_y (mercator_y) TYPE minmax,
decimallatitude Float64,
decimallongitude Float64,
h3_5 UInt64 MATERIALIZED geoToH3(decimallongitude,decimallatitude,5),
h3_6 UInt64 MATERIALIZED geoToH3(decimallongitude,decimallatitude,6),
h3_7 UInt64 MATERIALIZED geoToH3(decimallongitude,decimallatitude,7),
h3_8 UInt64 MATERIALIZED geoToH3(decimallongitude,decimallatitude,8),
h3_9 UInt64 MATERIALIZED geoToH3(decimallongitude,decimallatitude,9),
h3_10 UInt64 MATERIALIZED geoToH3(decimallongitude,decimallatitude,10),
h3_11 UInt64 MATERIALIZED geoToH3(decimallongitude,decimallatitude,11),
h3_12 UInt64 MATERIALIZED geoToH3(decimallongitude,decimallatitude,12),
h3_13 UInt64 MATERIALIZED geoToH3(decimallongitude,decimallatitude,13),
h3_14 UInt64 MATERIALIZED geoToH3(decimallongitude,decimallatitude,14),
h3_15 UInt64 MATERIALIZED geoToH3(decimallongitude,decimallatitude,15),
datasetkey UUID,
kingdomkey UInt8,
phylumkey UInt32,
classkey UInt32,
orderkey UInt32,
familykey UInt32,
genuskey UInt32,
specieskey UInt32,
taxonkey UInt32,
year LowCardinality(UInt16),
occcount UInt64
) ENGINE = MergeTree ORDER BY (mortonEncode(mercator_x, mercator_y));
# populate with the Observation.org data
INSERT INTO occurrence_h3
SELECT
decimallatitude,
decimallongitude,
datasetkey,
kingdomkey,
phylumkey,
classkey,
orderkey,
familykey,
genuskey,
specieskey,
taxonkey,
year,
occcount
FROM hdfs_niels_occurrence
WHERE
datasetkey='8a863029-f435-446a-821e-275f4f641165'
AND decimallatitude between -85 and 85 AND decimallongitude is not null;
GRANT SELECT ON default.occurrence_h3 TO tim;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment