Skip to content

Instantly share code, notes, and snippets.

@jiristepan
Last active September 22, 2021 09:09
Show Gist options
  • Save jiristepan/91ebb8cbeafb0c13b3a2a2daf684365a to your computer and use it in GitHub Desktop.
Load Adobe data feed data to Google BigQuery
#!/bin/bash
# First in https://console.cloud.google.com/
# 1) Create project
# 2) Enable billing
# 3) Install gcloud command line tools and run `gcloud init` to login and configure
# (https://cloud.google.com/sdk/docs/downloads-apt-get)
#
# Created by data hackers from http://activate.cz
################## configuration ###################
dataDir="/your/folder/whereAreAdobeDataExported" # folder containing the Adobe data feed export files
gcAccount="yourcompany.com" # google console billing account
gcProject="my-project" # google cloud project id
dataSetName="analytics" # BigQuery dataset name
datasetLocation="EU" # where data should be processed (https://cloud.google.com/bigquery/docs/dataset-locations)
maxError=100 # maximum number of bad records tolerated when loading a CSV file
######################################################
# Fully-qualified dataset reference. bq uses "project:dataset" syntax;
# the billing account is not part of a dataset reference.
bqDataset="${gcProject}:${dataSetName}"
wd=$(pwd) # remember the starting directory (the script cd's into $dataDir below)
# init: point gcloud at the target project, then create the dataset.
# NOTE: `bq mk` exits non-zero if the dataset already exists; that is
# harmless on re-runs since the loads below reference it by name.
gcloud config set project "${gcProject}"
# create dataset in the configured location
bq --location="${datasetLocation}" mk -d --description "adobe analytics data feed" "${dataSetName}"
# Import the lookup tables that have two columns (id, name).
declare -a lookupTables=("browser" "browser_type" "color_depth" "connection_type" "country" "event" "javascript_version" "languages" "operating_systems" "plugins" "resolution" "search_engines")
cd "${dataDir}" || { echo "cannot cd to ${dataDir}" >&2; exit 1; }
# unpack the lookup-table archive shipped alongside the hit data
tar -xzvf ./*lookup_data.tar.gz
# Iterate over the array directly. The original C-style loop
# (i=1; i<len; index i-1) only covered indices 0..len-2 and therefore
# silently skipped the last table ("search_engines") — off-by-one fix.
for tableName in "${lookupTables[@]}"; do
  echo "Importing ${tableName}"
  # use ${datasetLocation} (was hardcoded to EU) so all loads agree with the dataset
  bq load --location="${datasetLocation}" --replace --field_delimiter="\t" --source_format=CSV "${dataSetName}.${tableName}" "./${tableName}.tsv" id:STRING,name:STRING
done
# Import the referrer_type lookup table separately — it has three columns
# (id, name, type) rather than the two shared by the other lookups.
echo "Importing referrer_type"
# use ${datasetLocation} (was hardcoded to EU) so all loads agree with the dataset
bq load --location="${datasetLocation}" --replace --field_delimiter="\t" --source_format=CSV "${dataSetName}.referrer_type" ./referrer_type.tsv id:STRING,name:STRING,type:STRING
# Build a BigQuery schema string from the exported header row:
# each tab becomes ":STRING," and the final column gets a trailing ":STRING",
# e.g. "a\tb\tc" -> "a:STRING,b:STRING,c:STRING" (every column loaded as STRING).
# Single sed with two expressions replaces the original `cat | sed | sed` pipeline.
sed -e $'s/\t/:STRING,/g' -e 's/$/:STRING/' column_headers.tsv > schema.txt
# Import every hit-data file into the single "data" table, tolerating up to
# ${maxError} bad rows per file (jagged rows and unknown values are allowed
# because the feed occasionally ships short/extra columns).
gunzip ./*-*.tsv.gz
for file in ./*-*.tsv; do
  [ -e "$file" ] || continue # unmatched glob would yield the literal pattern — skip it
  echo "Importing $file"
  # schema.txt holds a comma-separated column list (no whitespace), built above
  bq load --location="${datasetLocation}" --field_delimiter="\t" --allow_jagged_rows --ignore_unknown_values --max_bad_records="${maxError}" --source_format=CSV "${dataSetName}.data" "$file" "$(cat schema.txt)"
done
#all done
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment