This example shows how to convert a DwC-A (Darwin Core Archive) into interpreted Avro files.
# Clone the gbif/pipelines repository and build it with Maven.
# -DskipTests skips running the tests during the build.
git clone https://github.com/gbif/pipelines.git
cd pipelines
mvn clean package -DskipTests
Pull some data:
# Move into the ingest-gbif-standalone directory and download the archive,
# saving it as input.dwca.
cd pipelines/ingest-gbif-standalone
# The URL is quoted because it contains '?', which the shell treats as a glob
# character in an unquoted word (zsh aborts with "no matches found"; bash
# would expand it if a matching path happened to exist).
wget -O input.dwca 'http://ogc-act.csiro.au/ipt/archive.do?r=csiro_gbr_sbd'
Create the minimal properties file allowed (no caches, etc.):
# Write a one-line properties file containing only the GBIF API URL setting.
printf '%s\n' 'gbif.api.url=https://api.gbif.org' > pipelines.properties
Convert the DwC-A into the equivalent verbatim Avro format
# Step 1: run the shaded standalone jar with --pipelineStep=DWCA_TO_VERBATIM,
# reading ./input.dwca and using ./data as the target path.
# JVM flags: G1 garbage collector, 1G initial heap, 4G maximum heap.
# NOTE(review): the jar file name embeds a version (2.2.32-SNAPSHOT); presumably
# it must be adjusted to whatever the build placed in ./target -- confirm.
java -XX:+UseG1GC -Xms1G -Xmx4G \
-jar ./target/ingest-gbif-standalone-2.2.32-SNAPSHOT-shaded.jar \
--pipelineStep=DWCA_TO_VERBATIM \
--datasetId=3b7f6307-2f03-4f25-9f64-6aa4c3a3aea1 \
--attempt=1 \
--runner=SparkRunner \
--targetPath=./data \
--inputPath=./input.dwca \
--properties=./pipelines.properties
Convert the verbatim Avro into Avro files of interpreted data
# Step 2: run the same jar with --pipelineStep=VERBATIM_TO_INTERPRETED and
# --interpretationTypes=ALL. The input path is
# ./data/<datasetId>/<attempt>/verbatim.avro, using the same datasetId and
# attempt values as the DWCA_TO_VERBATIM command (presumably that step's output).
# NOTE(review): "--skipRegisrtyCalls" looks misspelled, but it is passed to the
# jar verbatim and presumably matches the option name the jar expects --
# confirm against the pipeline options before changing the spelling.
java -XX:+UseG1GC -Xms1G -Xmx4G \
-jar ./target/ingest-gbif-standalone-2.2.32-SNAPSHOT-shaded.jar \
--pipelineStep=VERBATIM_TO_INTERPRETED \
--datasetId=3b7f6307-2f03-4f25-9f64-6aa4c3a3aea1 \
--attempt=1 \
--interpretationTypes=ALL \
--runner=SparkRunner \
--targetPath=./data \
--inputPath=./data/3b7f6307-2f03-4f25-9f64-6aa4c3a3aea1/1/verbatim.avro \
--metaFileName=verbatim-to-interpreted.yml \
--properties=./pipelines.properties \
--useExtendedRecordId=true \
--skipRegisrtyCalls=true
Have a look in ./data