bq load --source_format=AVRO dataset.table gs://mybucket/00/*.avro
avro-tools getschema ... > schema.avsc
hdfs dfs -put schema.avsc /tmp
package org.gbif.pipelines.ingest.java;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;
import org.apache.hadoop.hbase.client.Get;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.Result;
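The snippet above is only the package declaration and imports. A minimal sketch of how such a lookup might be wired together is below; the class name, table name, column family and qualifier are assumptions for illustration (the o:i column matches the lookup output shown later in these notes), not confirmed configuration.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;
import org.apache.hadoop.hbase.client.Get;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.Table;
import org.apache.hadoop.hbase.util.Bytes;

public class OccurrenceKeyLookupSketch {

  public static void main(String[] args) throws Exception {
    // Reads hbase-site.xml from the classpath (e.g. /etc/hbase/conf on a gateway node).
    Configuration conf = HBaseConfiguration.create();

    // "occurrence_lookup" is a placeholder table name, not the real cluster configuration.
    try (Connection connection = ConnectionFactory.createConnection(conf);
         Table table = connection.getTable(TableName.valueOf("occurrence_lookup"))) {

      // Row keys in the examples below look like "<salt>:<datasetKey>|<occurrenceId or triplet>".
      Get get = new Get(Bytes.toBytes(args[0]));
      Result result = table.get(get);

      byte[] value = result.getValue(Bytes.toBytes("o"), Bytes.toBytes("i"));
      if (value != null) {
        // The 8-byte value is the GBIF occurrence key stored as a long.
        System.out.println(Bytes.toLong(value));
      }
    }
  }
}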
-- wget https://repository.gbif.org/repository/gbif/org/gbif/occurrence/occurrence-hive/0.187/occurrence-hive-0.187-jar-with-dependencies.jar
-- hdfs dfs -put occurrence-hive-0.187-jar-with-dependencies.jar /tmp
-- wget https://repository.gbif.org/repository/central/com/klout/brickhouse/0.6.0/brickhouse-0.6.0.jar
-- hdfs dfs -put brickhouse-0.6.0.jar /tmp
ADD JAR hdfs:///tmp/occurrence-hive-0.187-jar-with-dependencies.jar;
ADD JAR hdfs:///tmp/brickhouse-0.6.0.jar;
CREATE TEMPORARY FUNCTION toLocalISO8601 AS 'org.gbif.occurrence.hive.udf.ToLocalISO8601UDF';
CREATE TEMPORARY FUNCTION joinArray AS 'brickhouse.udf.collect.JoinArrayUDF';
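With the jars added and the functions registered, they can be used directly in queries. A rough example only; the table and column names below are placeholders, not the real occurrence schema:

-- Placeholder table/column names, for illustration.
-- Assumes eventdate is stored as epoch milliseconds and issue is an array<string>.
SELECT
  toLocalISO8601(eventdate) AS eventDate,
  joinArray(issue, ';') AS issues
FROM my_occurrence_table
LIMIT 10;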
Based on the ideas here, expanded to enable Hive support:
https://www.linkedin.com/pulse/running-spark-2xx-cloudera-hadoop-distro-cdh-deenar-toraskar-cfa/ | |
wget https://archive.apache.org/dist/spark/spark-2.4.8/spark-2.4.8-bin-without-hadoop.tgz | |
tar -xvzf spark-2.4.8-bin-without-hadoop.tgz | |
cd spark-2.4.8-bin-without-hadoop | |
cp -R /etc/spark2/conf/* conf/ | |
cp /etc/hive/conf/hive-site.xml conf/
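The "without-hadoop" build does not bundle any Hadoop classes, so Spark also needs the cluster's Hadoop jars on its classpath. The usual way to do this for a Hadoop-free build, assuming the client tools are on the PATH, is:

echo 'export SPARK_DIST_CLASSPATH=$(hadoop classpath)' >> conf/spark-env.sh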
{
  "id": "https://doi.org/10.5281/zenodo.3968687",
  "profile": "tabular-data-package",
  "resources": [
    {
      "name": "reference-data",
      "path": "https://zenodo.org/record/3968687/files/LBBG_ZEEBRUGGE-reference-data.csv",
      "profile": "tabular-data-resource",
      "schema": {
        "fields": [
This is a quick test of a modified version of the Bloodhound Spark script, to check that it runs on the GBIF Cloudera cluster (CDH 5.16.2).
From the gateway, grab the file from HDFS (skipping HTTP for speed), unzip it (15-20 mins) and upload the contents back to HDFS:
hdfs dfs -getmerge /occurrence-download/prod-downloads/0002504-181003121212138.zip /mnt/auto/misc/bloodhound/data.zip
unzip /mnt/auto/misc/bloodhound/data.zip -d /mnt/auto/misc/bloodhound/data
hdfs dfs -rm /tmp/verbatim.txt
hdfs dfs -rm /tmp/occurrence.txt
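The upload step itself isn't shown above; judging from the rm commands, it would be something along these lines (paths assumed, not taken from the original notes):

hdfs dfs -put /mnt/auto/misc/bloodhound/data/verbatim.txt /tmp/verbatim.txt
hdfs dfs -put /mnt/auto/misc/bloodhound/data/occurrence.txt /tmp/occurrence.txt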
Using the lookup tool on c5gateway-vh.gbif.org we can get the keys for the id 1668748136:
12:06:39 UTC c5gateway-vh /usr/local/bin $ ./lookup-occurrence-key 1668748136
Lookup 1668748136 with dataset key from API 97bd086a-cf43-11e2-a9b3-00145eb45e9a
27:97bd086a-cf43-11e2-a9b3-00145eb45e9a|JMRC|JMRCfungicoll|JMRC:FSU:02570 / 14837 / 750|null column=o:i, timestamp=1553909664771, value=\x00\x00\x00\x00cw\x13h
73:97bd086a-cf43-11e2-a9b3-00145eb45e9a|http://id.snsb.info/ext/14837/14837/5004 column=o:i, timestamp=1563244584180, value=\x00\x00\x00\x00cw\x13h
74:97bd086a-cf43-11e2-a9b3-00145eb45e9a|http://id.snsb.info/ext/14837/14837/5005 column=o:i, timestamp=1563244586420, value=\x00\x00\x00\x00cw\x13h
75:97bd086a-cf43-11e2-a9b3-00145eb45e9a|http://id.snsb.info/ext/14837/14837/5006 column=o:i, timestamp=1553909265952, value=\x00\x00\x00\x00cw\x13h
76:97bd086a-cf43-11e2-a9b3-00145eb45e9a|http://id.snsb.info/ext/14837/14837/5007 column=o:i, timestamp=1563244589868, value=\x00\x00\x00\x00cw\x13h
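All of these rows carry the same value: the 8-byte long 0x0000000063771368, i.e. the occurrence key 1668748136 (the 'c', 'w' and 'h' in the output are just the printable bytes of that long).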
{
  "_shards": {
    "total": 812,
    "successful": 812,
    "failed": 0
  },
  "_all": {
    "primaries": {},
    "total": {}
  },
occurrenceCount,
// verbatim fields in records
v_kingdom,
v_phylum,
v_class,
v_order,
v_family,
v_genus,
v_scientificName,