Copy a MySQL table to BigQuery. If you need to copy all tables, use the wrapper loop given at the end. Exits with error code 3 if blob columns are found. The dumped table data is first copied to Google Cloud Storage before being imported into BigQuery.
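As a quick sketch of a single-table run (mysql2bq.sh is the main script below; the schema, table, and dataset names here are placeholders, not values from the gist):

./mysql2bq.sh my_schema my_table my_bigquery_dataset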
#!/usr/bin/env bash
# install google utilities
wget http://commondatastorage.googleapis.com/pub/gsutil.tar.gz
tar xfz gsutil.tar.gz -C $HOME
# add gsutil to the PATH by appending the export line below to ~/.bashrc
vi ~/.bashrc
export PATH=${PATH}:$HOME/gsutil
cd gsutil
python setup.py install
gsutil config
sudo sh
easy_install bigquery
bq init
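A small, optional sanity check after the setup above, assuming both tools landed on the PATH:

gsutil version
bq version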
#!/bin/bash
set -x
# Clean up the files we've made
function cleanup_files() {
  rm -f "${TABLE_NAME}".*
  rm -f "${file_prefix}"*
}
TABLE_SCHEMA=$1
TABLE_NAME=$2
BIGQUERY_DATASET=$3
# Customize these for your organization
BUCKET_PREFIX="eq"
GZIP_ENABLED="false"
mytime=$(date '+%y%m%d%H%M')
#hostname=$(hostname | tr '[:upper:]' '[:lower:]')
file_prefix="${TABLE_NAME}${mytime}${TABLE_SCHEMA}"
bucket_name="${BUCKET_PREFIX}-${file_prefix}"
splitat="4000000000"
bulkfiles=200
maxbad=300
if [ $# -ne 3 ]; then
  echo 'ERROR, must provide three arguments: ./mysql2bq.sh TABLE_SCHEMA TABLE_NAME BIGQUERY_DATASET'
  exit 1
fi
# Check if the table has blob data
cat > blob_query.txt << heredoc
select sum(IF((DATA_TYPE LIKE '%blob%'),1, 0)) from INFORMATION_SCHEMA.columns where TABLE_SCHEMA = '$TABLE_SCHEMA' AND TABLE_NAME = '$TABLE_NAME'
heredoc
blob_count=$(mysql -Bs < blob_query.txt)
if [[ "$blob_count" -gt 0 ]]; then
  echo "blob column found in table $TABLE_NAME"
  exit 3
fi
if ! gsutil mb "gs://$bucket_name"; then
  cleanup_files
  echo "bucket $bucket_name could not be created in cloud"
  exit 4
fi
# create JSON schema from mysql table structure
cat > json_query.txt << heredoc
select CONCAT('{"name": "', COLUMN_NAME, '","type":"', IF(DATA_TYPE like "%date%", "DATE",IF(DATA_TYPE like "%int%", "INTEGER",IF(DATA_TYPE = "decimal","FLOAT","STRING"))) , '"},') as json from information_schema.columns where TABLE_SCHEMA = '$TABLE_SCHEMA' AND TABLE_NAME = '$TABLE_NAME';
heredoc
echo '[' > "$TABLE_NAME.json"
mysql -Bs < json_query.txt | sed '$s/,$//' >> "$TABLE_NAME.json"
mysql "$TABLE_SCHEMA" -Bse"show create table $TABLE_NAME\\G" > "$TABLE_NAME.sql"
echo ', {"name": "hostname","type":"STRING"} ]' >> "$TABLE_NAME.json"
# copy json and create table data to cloud
gsutil cp "$TABLE_NAME.json" "gs://$bucket_name/"
gsutil cp "$TABLE_NAME.sql" "gs://$bucket_name/"
# Dump the data, strip carriage returns, quotes and NULLs, tag each row with the schema name, and we're done
time mysql --quick "$TABLE_SCHEMA" -Bse"select * from $TABLE_NAME" \
  | sed -e "s/\(\\r\|\"\|NULL\)//g;s/$/\\t$TABLE_SCHEMA/" > "$TABLE_NAME.txt"
# split files with prefix
time split -C "$splitat" "$TABLE_NAME.txt" "$file_prefix"
GZIP_FLAGS=""
if [[ "$GZIP_ENABLED" == "true" ]]; then
  GZIP_FLAGS="-J"
fi
# -m copies in parallel; -J (when gzip is enabled) compresses the uploads in transit.
# GZIP_FLAGS is intentionally unquoted so it splits into separate arguments.
if ! gsutil -m cp $GZIP_FLAGS "$file_prefix"* "gs://$bucket_name/"; then
  cleanup_files
  echo "$file_prefix files could not be copied to cloud"
  exit 3
fi
##
## Import to BigQuery
##
for mylist in $(gsutil ls "gs://$bucket_name/*" | xargs -n$bulkfiles | tr ' ' ','); do
  echo "$mylist"
  ##
  ## Check if we have a DATE column, if we do then we can use partitioning
  ##
  BQ_ARGS="--nosync -F \t --job_id=$TABLE_NAME --max_bad_records=$maxbad"
  if grep -q "DATE" "$TABLE_NAME.json"; then
    FIELD=$(grep "DATE" "$TABLE_NAME.json" | tail -1 | cut -d'"' -f4)
    BQ_ARGS+=" --time_partitioning_type=DAY --time_partitioning_field=${FIELD}"
  fi
  if ! bq load $BQ_ARGS "${BIGQUERY_DATASET}.${TABLE_NAME}" "$mylist" "${TABLE_NAME}.json"; then
    echo "bq load failed for $mylist, check the files exist in cloud"
    cleanup_files
    exit 2
  fi
done
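For illustration only: given a hypothetical table orders with an INT column id and a DATE column created_at, the schema file built by the JSON query and the two echo lines above would come out roughly as:

cat orders.json
[
{"name": "id","type":"INTEGER"},
{"name": "created_at","type":"DATE"}
, {"name": "hostname","type":"STRING"} ]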
#!/bin/bash
BIGQUERY_DATASET=$1
TABLE_SCHEMA=$2
LOG_TO_FILE="false"
if [[ $# -ne 2 ]]; then
  echo 'ERROR, must provide two arguments: ./wrapper.sh BIGQUERY_DATASET TABLE_SCHEMA'
  exit 1
fi
bq mk "$BIGQUERY_DATASET" || true
# mysqlshow prints a bordered table with a header at the top; we strip the header with tail
# and then use awk to get the contents of the table cell instead of the borders
for tbl_name in $(mysqlshow "$TABLE_SCHEMA" | tail -n +5 | awk '{print $2}'); do
  #sh -xv mysql2bq.sh $TABLE_SCHEMA $tbl_name > script_succ.txt 2> script_err.txt
  if [[ "$LOG_TO_FILE" == "true" ]]; then
    ./mysql2bq.sh "$TABLE_SCHEMA" "$tbl_name" "$BIGQUERY_DATASET" > script_success.txt 2> script_error.txt
  else
    ./mysql2bq.sh "$TABLE_SCHEMA" "$tbl_name" "$BIGQUERY_DATASET"
  fi
done
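A hedged usage sketch for the wrapper, assuming both scripts sit in the working directory and mysql, gsutil and bq are already configured (the dataset and schema names are placeholders):

./wrapper.sh my_bigquery_dataset my_schema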