Skip to content

Instantly share code, notes, and snippets.

@jiristepan
Last active September 22, 2021 09:09
Show Gist options
  • Save jiristepan/91ebb8cbeafb0c13b3a2a2daf684365a to your computer and use it in GitHub Desktop.
Load Adobe data feed data to Google BigQuery
#!/bin/bash
# First in https://console.cloud.google.com/
# 1) Create project
# 2) Enable billing
# 3) Install gcloud command line tools and run `gcloud init` to login and configure
# (https://cloud.google.com/sdk/docs/downloads-apt-get)
#
# Created by data hackers from http://activate.cz
################## configuration ###################
dataDir="/your/folder/whereAreAdobeDataExported" # folder containing the Adobe data feed export files
gcAccount="yourcompany.com" # google console billing account
gcProject="my-project" # google cloud project id
dataSetName="analytics" # BigQuery dataset name
datasetLocation="EU" # where data should be processed (https://cloud.google.com/bigquery/docs/dataset-locations)
maxError=100 # maximum number of bad records tolerated when loading a CSV file
######################################################
# Fully-qualified dataset reference. bq uses "project:dataset" syntax;
# the billing account is not part of a dataset reference.
bqDataset="${gcProject}:${dataSetName}"
wd=$(pwd) # remember the starting directory (the script cd's into $dataDir below)
# init: point gcloud at the target project, then create the dataset.
# NOTE: `bq mk` exits non-zero if the dataset already exists; that is
# harmless on re-runs since the loads below reference it by name.
gcloud config set project "${gcProject}"
# create dataset in the configured location
bq --location="${datasetLocation}" mk -d --description "adobe analytics data feed" "${dataSetName}"
# Import the lookup tables that have two columns (id, name).
declare -a lookupTables=("browser" "browser_type" "color_depth" "connection_type" "country" "event" "javascript_version" "languages" "operating_systems" "plugins" "resolution" "search_engines")
cd "${dataDir}" || { echo "cannot cd to ${dataDir}" >&2; exit 1; }
# unpack the lookup-table archive shipped alongside the hit data
tar -xzvf ./*lookup_data.tar.gz
# Iterate over the array directly. The original C-style loop
# (i=1; i<len; index i-1) only covered indices 0..len-2 and therefore
# silently skipped the last table ("search_engines") — off-by-one fix.
for tableName in "${lookupTables[@]}"; do
  echo "Importing ${tableName}"
  # use ${datasetLocation} (was hardcoded to EU) so all loads agree with the dataset
  bq load --location="${datasetLocation}" --replace --field_delimiter="\t" --source_format=CSV "${dataSetName}.${tableName}" "./${tableName}.tsv" id:STRING,name:STRING
done
# Import the referrer_type lookup table separately — it has three columns
# (id, name, type) rather than the two shared by the other lookups.
echo "Importing referrer_type"
# use ${datasetLocation} (was hardcoded to EU) so all loads agree with the dataset
bq load --location="${datasetLocation}" --replace --field_delimiter="\t" --source_format=CSV "${dataSetName}.referrer_type" ./referrer_type.tsv id:STRING,name:STRING,type:STRING
# Build a BigQuery schema string from the exported header row:
# each tab becomes ":STRING," and the final column gets a trailing ":STRING",
# e.g. "a\tb\tc" -> "a:STRING,b:STRING,c:STRING" (every column loaded as STRING).
# Single sed with two expressions replaces the original `cat | sed | sed` pipeline.
sed -e $'s/\t/:STRING,/g' -e 's/$/:STRING/' column_headers.tsv > schema.txt
# Import every hit-data file into the single "data" table, tolerating up to
# ${maxError} bad rows per file (jagged rows and unknown values are allowed
# because the feed occasionally ships short/extra columns).
gunzip ./*-*.tsv.gz
for file in ./*-*.tsv; do
  [ -e "$file" ] || continue # unmatched glob would yield the literal pattern — skip it
  echo "Importing $file"
  # schema.txt holds a comma-separated column list (no whitespace), built above
  bq load --location="${datasetLocation}" --field_delimiter="\t" --allow_jagged_rows --ignore_unknown_values --max_bad_records="${maxError}" --source_format=CSV "${dataSetName}.data" "$file" "$(cat schema.txt)"
done
#all done
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment