bq load --source_format=AVRO dataset.table gs://mybucket/00/*.avro
avro-tools getschema ... > schema.avsc
hdfs dfs -put schema.avsc /tmp
package org.gbif.pipelines.ingest.java;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;
import org.apache.hadoop.hbase.client.Get;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.Result;
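The snippet above is only the package declaration and imports. A minimal sketch of how such a lookup might be wired together is below; the class name, table name, column family and qualifier are assumptions for illustration (the o:i column matches the lookup output shown later in these notes), not confirmed configuration.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;
import org.apache.hadoop.hbase.client.Get;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.Table;
import org.apache.hadoop.hbase.util.Bytes;

public class OccurrenceKeyLookupSketch {

  public static void main(String[] args) throws Exception {
    // Reads hbase-site.xml from the classpath (e.g. /etc/hbase/conf on a gateway node).
    Configuration conf = HBaseConfiguration.create();

    // "occurrence_lookup" is a placeholder table name, not the real cluster configuration.
    try (Connection connection = ConnectionFactory.createConnection(conf);
         Table table = connection.getTable(TableName.valueOf("occurrence_lookup"))) {

      // Row keys in the examples below look like "<salt>:<datasetKey>|<occurrenceId or triplet>".
      Get get = new Get(Bytes.toBytes(args[0]));
      Result result = table.get(get);

      byte[] value = result.getValue(Bytes.toBytes("o"), Bytes.toBytes("i"));
      if (value != null) {
        // The 8-byte value is the GBIF occurrence key stored as a long.
        System.out.println(Bytes.toLong(value));
      }
    }
  }
}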
-- wget https://repository.gbif.org/repository/gbif/org/gbif/occurrence/occurrence-hive/0.187/occurrence-hive-0.187-jar-with-dependencies.jar
-- hdfs dfs -put occurrence-hive-0.187-jar-with-dependencies.jar /tmp
-- wget https://repository.gbif.org/repository/central/com/klout/brickhouse/0.6.0/brickhouse-0.6.0.jar
-- hdfs dfs -put brickhouse-0.6.0.jar /tmp
ADD JAR hdfs:///tmp/occurrence-hive-0.187-jar-with-dependencies.jar;
ADD JAR hdfs:///tmp/brickhouse-0.6.0.jar;
CREATE TEMPORARY FUNCTION toLocalISO8601 AS 'org.gbif.occurrence.hive.udf.ToLocalISO8601UDF';
CREATE TEMPORARY FUNCTION joinArray AS 'brickhouse.udf.collect.JoinArrayUDF';
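With the jars added and the functions registered, they can be used directly in queries. A rough example only; the table and column names below are placeholders, not the real occurrence schema:

-- Placeholder table/column names, for illustration.
-- Assumes eventdate is stored as epoch milliseconds and issue is an array<string>.
SELECT
  toLocalISO8601(eventdate) AS eventDate,
  joinArray(issue, ';') AS issues
FROM my_occurrence_table
LIMIT 10;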
Based on the ideas here, expanded to enable Hive support:
https://www.linkedin.com/pulse/running-spark-2xx-cloudera-hadoop-distro-cdh-deenar-toraskar-cfa/ | |
wget https://archive.apache.org/dist/spark/spark-2.4.8/spark-2.4.8-bin-without-hadoop.tgz | |
tar -xvzf spark-2.4.8-bin-without-hadoop.tgz | |
cd spark-2.4.8-bin-without-hadoop | |
cp -R /etc/spark2/conf/* conf/ | |
cp /etc/hive/conf/hive-site.xml conf/
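The "without-hadoop" build does not bundle any Hadoop classes, so Spark also needs the cluster's Hadoop jars on its classpath. The usual way to do this for a Hadoop-free build, assuming the client tools are on the PATH, is:

echo 'export SPARK_DIST_CLASSPATH=$(hadoop classpath)' >> conf/spark-env.sh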
{
  "id": "https://doi.org/10.5281/zenodo.3968687",
  "profile": "tabular-data-package",
  "resources": [
    {
      "name": "reference-data",
      "path": "https://zenodo.org/record/3968687/files/LBBG_ZEEBRUGGE-reference-data.csv",
      "profile": "tabular-data-resource",
      "schema": {
        "fields": [
This is a quick test of a modified version of the Bloodhound Spark script, to check that it runs on the GBIF Cloudera cluster (CDH 5.16.2).
From the gateway, grab the file from HDFS (skipping HTTP for speed), unzip it (15-20 mins) and upload the contents back to HDFS:
hdfs dfs -getmerge /occurrence-download/prod-downloads/0002504-181003121212138.zip /mnt/auto/misc/bloodhound/data.zip
unzip /mnt/auto/misc/bloodhound/data.zip -d /mnt/auto/misc/bloodhound/data
hdfs dfs -rm /tmp/verbatim.txt
hdfs dfs -rm /tmp/occurrence.txt
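The upload step itself isn't shown above; judging from the rm commands, it would be something along these lines (paths assumed, not taken from the original notes):

hdfs dfs -put /mnt/auto/misc/bloodhound/data/verbatim.txt /tmp/verbatim.txt
hdfs dfs -put /mnt/auto/misc/bloodhound/data/occurrence.txt /tmp/occurrence.txt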
Using the lookup tool on c5gateway-vh.gbif.org we can get the keys for the id 1668748136:
12:06:39 UTC c5gateway-vh /usr/local/bin $ ./lookup-occurrence-key 1668748136
Lookup 1668748136 with dataset key from API 97bd086a-cf43-11e2-a9b3-00145eb45e9a
27:97bd086a-cf43-11e2-a9b3-00145eb45e9a|JMRC|JMRCfungicoll|JMRC:FSU:02570 / 14837 / 750|null column=o:i, timestamp=1553909664771, value=\x00\x00\x00\x00cw\x13h
73:97bd086a-cf43-11e2-a9b3-00145eb45e9a|http://id.snsb.info/ext/14837/14837/5004 column=o:i, timestamp=1563244584180, value=\x00\x00\x00\x00cw\x13h
74:97bd086a-cf43-11e2-a9b3-00145eb45e9a|http://id.snsb.info/ext/14837/14837/5005 column=o:i, timestamp=1563244586420, value=\x00\x00\x00\x00cw\x13h
75:97bd086a-cf43-11e2-a9b3-00145eb45e9a|http://id.snsb.info/ext/14837/14837/5006 column=o:i, timestamp=1553909265952, value=\x00\x00\x00\x00cw\x13h
76:97bd086a-cf43-11e2-a9b3-00145eb45e9a|http://id.snsb.info/ext/14837/14837/5007 column=o:i, timestamp=1563244589868, value=\x00\x00\x00\x00cw\x13h
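All of these rows carry the same value: the 8-byte long 0x0000000063771368, i.e. the occurrence key 1668748136 (the 'c', 'w' and 'h' in the output are just the printable bytes of that long).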
{
  "_shards": {
    "total": 812,
    "successful": 812,
    "failed": 0
  },
  "_all": {
    "primaries": {},
    "total": {}
  },
occurrenceCount,
// verbatim fields in records
v_kingdom,
v_phylum,
v_class,
v_order,
v_family,
v_genus,
v_scientificName,