Last active
September 22, 2021 09:09
-
-
Save jiristepan/91ebb8cbeafb0c13b3a2a2daf684365a to your computer and use it in GitHub Desktop.
Load Adobe data feed data to Google BigQuery
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/bin/bash | |
| # First in https://console.cloud.google.com/ | |
| # 1) Create project | |
| # 2) Enable billing | |
| # 3) Install gcloud command line tools and run `gcloud init` to login and configure | |
| # (https://cloud.google.com/sdk/docs/downloads-apt-get) | |
| # | |
| # Created by data hackers from http://activate.cz | |
| ################## configuration ################### | |
| dataDir="/your/folder/whereAreAdobeDataExported" # folder where are data exported from Adobe using data feed export | |
| gcAccount="yourcompany.com" # google console billineg account | |
| gcProject="my-project" # google cloud project id | |
| dataSetName="analytics" # BigQueryDataset name | |
| datasetLocation="EU" # Where data sould be processed (https://cloud.google.com/bigquery/docs/dataset-locations) | |
| maxError=100 # Maximum number of errors when uploading CSV file | |
| ###################################################### | |
| bqDataset="${gcAccount}:${gcProject}:${dataSetName}" # | |
| wd=`pwd` | |
| # init dataset | |
| gcloud config set project $gcProject | |
| #create dataset | |
| bq --location=${datasetLocation} mk -d --description "adobe analytics data feed" ${dataSetName} | |
| #import lookup tables with two columns | |
| declare -a lookupTables=("browser" "browser_type" "color_depth" "connection_type" "country" "event" "javascript_version" "languages" "operating_systems" "plugins" "resolution" "search_engines") | |
| len=${#lookupTables[@]} | |
| cd $dataDir | |
| tar -xzvf *lookup_data.tar.gz | |
| for ((i=1; i<${len}; i++)); | |
| do | |
| tableName="${lookupTables[$i-1]}" | |
| echo "Importing ${tableName}" | |
| bq load --location=EU --replace --field_delimiter="\t" --source_format=CSV ${dataSetName}.${tableName} ./${tableName}.tsv id:STRING,name:STRING | |
| done | |
| # import lookup table referer type (has 3 columns) | |
| echo "Importing referrer_type" | |
| bq load --location=EU --replace --field_delimiter="\t" --source_format=CSV ${dataSetName}.referrer_type ./referrer_type.tsv id:STRING,name:STRING,type:STRING | |
| #prepare schema definition based on columns_headers | |
| cat column_headers.tsv | sed $'s/\t/:STRING,/g' | sed s/$/:STRING/ > schema.txt | |
| #import all the tables and ignore some erorrs | |
| gunzip *-*.tsv.gz | |
| for file in *-*.tsv | |
| do | |
| echo "Importing $file" | |
| bq load --location=${datasetLocation} --field_delimiter="\t" --allow_jagged_rows --ignore_unknown_values --max_bad_records=${maxError} --source_format=CSV ${dataSetName}.data ./$file `cat schema.txt` | |
| done | |
| #all done |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment