bq load --source_format=AVRO dataset.table gs://mybucket/00/*.avro
avro-utils getschema ... > schema.avsc
hdfs dfs -put schema.avsc /tmp
# Create the Parquet files for the data
CREATE TABLE tim.niels STORED AS parquet AS | |
decimalLatitude, | |
decimalLongitude, | |
datasetKey, | |
kingdomKey, | |
phylumKey, | |
classKey, |
package; | |
import org.apache.hadoop.conf.Configuration; | |
import org.apache.hadoop.hbase.HBaseConfiguration; | |
import org.apache.hadoop.hbase.TableName; | |
import org.apache.hadoop.hbase.client.Connection; | |
import org.apache.hadoop.hbase.client.ConnectionFactory; | |
import org.apache.hadoop.hbase.client.Get; | |
import org.apache.hadoop.hbase.client.HTable; | |
import org.apache.hadoop.hbase.client.Result; |
-- wget | |
-- hdfs dfs -put occurrence-hive-0.187-jar-with-dependencies.jar /tmp | |
-- wget | |
-- hdfs dfs -put brickhouse-0.6.0.jar /tmp | |
ADD JAR hdfs:///tmp/occurrence-hive-0.187-jar-with-dependencies.jar; | |
ADD JAR hdfs:///tmp/brickhouse-0.6.0.jar; | |
CREATE TEMPORARY FUNCTION toLocalISO8601 AS 'org.gbif.occurrence.hive.udf.ToLocalISO8601UDF'; | |
CREATE TEMPORARY FUNCTION joinArray AS 'brickhouse.udf.collect.JoinArrayUDF'; |
Based on ideas here, expanded to enable Hive support.
wget | |
tar -xvzf spark-2.4.8-bin-without-hadoop.tgz | |
cd spark-2.4.8-bin-without-hadoop | |
cp -R /etc/spark2/conf/* conf/ | |
cp /etc/hive/conf/hive-site.xml conf/ |
{ | |
"id": "", | |
"profile": "tabular-data-package", | |
"resources": [ | |
{ | |
"name": "reference-data", | |
"path": "", | |
"profile": "tabular-data-resource", | |
"schema": { | |
"fields": [ |
This is a quick test of a modified version of the Bloodhound Spark script, to check that it runs on the GBIF Cloudera cluster (CDH 5.16.2).
From the gateway, grab the file from HDFS (skipping HTTP for speed), unzip it (15-20 mins) and upload the result to HDFS:
hdfs dfs -getmerge /occurrence-download/prod-downloads/ /mnt/auto/misc/bloodhound/
unzip /mnt/auto/misc/bloodhound/ -d /mnt/auto/misc/bloodhound/data
hdfs dfs -rm /tmp/verbatim.txt
hdfs dfs -rm /tmp/occurrence.txt
Using the `lookup-occurrence-key` tool on the gateway (c5gateway-vh), we can get the keys for the id 1668748136:
12:06:39 UTC c5gateway-vh /usr/local/bin $ ./lookup-occurrence-key 1668748136
Lookup 1668748136 with dataset key from API 97bd086a-cf43-11e2-a9b3-00145eb45e9a
27:97bd086a-cf43-11e2-a9b3-00145eb45e9a|JMRC|JMRCfungicoll|JMRC:FSU:02570 / 14837 / 750|null column=o:i, timestamp=1553909664771, value=\x00\x00\x00\x00cw\x13h
73:97bd086a-cf43-11e2-a9b3-00145eb45e9a| column=o:i, timestamp=1563244584180, value=\x00\x00\x00\x00cw\x13h
74:97bd086a-cf43-11e2-a9b3-00145eb45e9a| column=o:i, timestamp=1563244586420, value=\x00\x00\x00\x00cw\x13h
75:97bd086a-cf43-11e2-a9b3-00145eb45e9a| column=o:i, timestamp=1553909265952, value=\x00\x00\x00\x00cw\x13h
76:97bd086a-cf43-11e2-a9b3-00145eb45e9a| column=o:i, timestamp=1563244589868, value=\x00\x00\x00\x00cw\x13h
{ | |
"_shards": { | |
"total": 812, | |
"successful": 812, | |
"failed": 0 | |
}, | |
"_all": { | |
"primaries": {}, | |
"total": {} | |
}, |