Last active
January 21, 2025 08:50
-
-
Save timrobertson100/cbcb6c681fe1cecd2ca057204b8f292a to your computer and use it in GitHub Desktop.
Data preparation from GBIF to clickhouse with H3 indexes
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# create the parquet files for the data | |
CREATE TABLE tim.niels STORED AS parquet AS | |
SELECT | |
decimalLatitude, | |
decimalLongitude, | |
datasetKey, | |
kingdomKey, | |
phylumKey, | |
classKey, | |
orderKey, | |
familyKey, | |
genusKey, | |
speciesKey, | |
taxonKey, | |
year, | |
count(*) AS occCount | |
FROM prod_h.occurrence | |
GROUP BY | |
decimalLatitude, | |
decimalLongitude, | |
datasetKey, | |
kingdomKey, | |
phylumKey, | |
classKey, | |
orderKey, | |
familyKey, | |
genusKey, | |
speciesKey, | |
taxonKey, | |
year; | |
# Mount as an external table in clickhouse | |
CREATE TABLE hdfs_niels_occurrence | |
( | |
decimallatitude Float64, | |
decimallongitude Float64, | |
datasetkey UUID, | |
kingdomkey UInt8, | |
phylumkey UInt32, | |
classkey UInt32, | |
orderkey UInt32, | |
familykey UInt32, | |
genuskey UInt32, | |
specieskey UInt32, | |
taxonkey UInt32, | |
year UInt16, | |
occcount UInt64 | |
) ENGINE = HDFS('hdfs://ha-nn/user/hive/warehouse/tim.db/niels/*', 'parquet'); | |
# create the table with h3 indexes | |
SET allow_suspicious_low_cardinality_types=1; | |
CREATE TABLE occurrence_h3 | |
( | |
mercator_x UInt32 MATERIALIZED 0xFFFFFFFF * ((decimallongitude + 180) / 360), | |
mercator_y UInt32 MATERIALIZED 0xFFFFFFFF * (1/2 - log(tan((decimallatitude + 90) / 360 * pi())) / 2 / pi()), | |
INDEX idx_x (mercator_x) TYPE minmax, | |
INDEX idx_y (mercator_y) TYPE minmax, | |
decimallatitude Float64, | |
decimallongitude Float64, | |
h3_5 UInt64 MATERIALIZED geoToH3(decimallongitude,decimallatitude,5), | |
h3_6 UInt64 MATERIALIZED geoToH3(decimallongitude,decimallatitude,6), | |
h3_7 UInt64 MATERIALIZED geoToH3(decimallongitude,decimallatitude,7), | |
h3_8 UInt64 MATERIALIZED geoToH3(decimallongitude,decimallatitude,8), | |
h3_9 UInt64 MATERIALIZED geoToH3(decimallongitude,decimallatitude,9), | |
h3_10 UInt64 MATERIALIZED geoToH3(decimallongitude,decimallatitude,10), | |
h3_11 UInt64 MATERIALIZED geoToH3(decimallongitude,decimallatitude,11), | |
h3_12 UInt64 MATERIALIZED geoToH3(decimallongitude,decimallatitude,12), | |
h3_13 UInt64 MATERIALIZED geoToH3(decimallongitude,decimallatitude,13), | |
h3_14 UInt64 MATERIALIZED geoToH3(decimallongitude,decimallatitude,14), | |
h3_15 UInt64 MATERIALIZED geoToH3(decimallongitude,decimallatitude,15), | |
datasetkey UUID, | |
kingdomkey UInt8, | |
phylumkey UInt32, | |
classkey UInt32, | |
orderkey UInt32, | |
familykey UInt32, | |
genuskey UInt32, | |
specieskey UInt32, | |
taxonkey UInt32, | |
year LowCardinality(UInt16), | |
occcount UInt64 | |
) ENGINE = MergeTree ORDER BY (mortonEncode(mercator_x, mercator_y)); | |
# populate with the Observation.org data | |
INSERT INTO occurrence_h3 | |
SELECT | |
decimallatitude, | |
decimallongitude, | |
datasetkey, | |
kingdomkey, | |
phylumkey, | |
classkey, | |
orderkey, | |
familykey, | |
genuskey, | |
specieskey, | |
taxonkey, | |
year, | |
occcount | |
FROM hdfs_niels_occurrence | |
WHERE | |
datasetkey='8a863029-f435-446a-821e-275f4f641165' | |
AND decimallatitude between -85 and 85 AND decimallongitude is not null; | |
GRANT SELECT ON default.occurrence_h3 TO tim; |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment