Copy a MySQL table to BigQuery. If you need to copy all tables, use the wrapper loop given at the end. Exits with error code 3 if blob columns are found. The dumped table data is first copied to Google Cloud Storage before being imported into BigQuery.
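As a quick sketch of a single-table run (mysql2bq.sh is the main script below; the schema, table, and dataset names here are placeholders, not values from the gist):

./mysql2bq.sh my_schema my_table my_bigquery_dataset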
#!/usr/bin/env bash
# install google utilities
wget http://commondatastorage.googleapis.com/pub/gsutil.tar.gz
tar xfz gsutil.tar.gz -C $HOME
# add gsutil to the PATH by appending the export line below to ~/.bashrc
vi ~/.bashrc
export PATH=${PATH}:$HOME/gsutil
cd gsutil
python setup.py install
gsutil config
sudo sh
easy_install bigquery
bq init
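A small, optional sanity check after the setup above, assuming both tools landed on the PATH:

gsutil version
bq version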
#!/bin/bash
set -x
# Clean up the files we've made
function cleanup_files() {
  rm -f "${TABLE_NAME}".*
  rm -f "${file_prefix}"*
}
TABLE_SCHEMA=$1
TABLE_NAME=$2
BIGQUERY_DATASET=$3
# Customize these for your organization
BUCKET_PREFIX="eq"
GZIP_ENABLED="false"
mytime=$(date '+%y%m%d%H%M')
#hostname=$(hostname | tr '[:upper:]' '[:lower:]')
file_prefix="${TABLE_NAME}${mytime}${TABLE_SCHEMA}"
bucket_name="${BUCKET_PREFIX}-${file_prefix}"
splitat="4000000000"
bulkfiles=200
maxbad=300
if [ $# -ne 3 ]; then
  echo 'ERROR, must provide three arguments: ./mysql2bq.sh TABLE_SCHEMA TABLE_NAME BIGQUERY_DATASET'
  exit 1
fi
# Check if the table has blob data
cat > blob_query.txt << heredoc
select sum(IF((DATA_TYPE LIKE '%blob%'),1, 0)) from INFORMATION_SCHEMA.columns where TABLE_SCHEMA = '$TABLE_SCHEMA' AND TABLE_NAME = '$TABLE_NAME'
heredoc
blob_count=$(mysql -Bs < blob_query.txt)
if [[ "$blob_count" -gt 0 ]]; then
  echo "blob column found in table $TABLE_NAME"
  exit 3
fi
if ! gsutil mb "gs://$bucket_name"; then
  cleanup_files
  echo "bucket $bucket_name could not be created in cloud"
  exit 4
fi
# create JSON schema from mysql table structure
cat > json_query.txt << heredoc
select CONCAT('{"name": "', COLUMN_NAME, '","type":"', IF(DATA_TYPE like "%date%", "DATE",IF(DATA_TYPE like "%int%", "INTEGER",IF(DATA_TYPE = "decimal","FLOAT","STRING"))) , '"},') as json from information_schema.columns where TABLE_SCHEMA = '$TABLE_SCHEMA' AND TABLE_NAME = '$TABLE_NAME';
heredoc
echo '[' > "$TABLE_NAME.json"
mysql -Bs < json_query.txt | sed '$s/,$//' >> "$TABLE_NAME.json"
mysql "$TABLE_SCHEMA" -Bse"show create table $TABLE_NAME\\G" > "$TABLE_NAME.sql"
echo ', {"name": "hostname","type":"STRING"} ]' >> "$TABLE_NAME.json"
# copy json and create table data to cloud
gsutil cp "$TABLE_NAME.json" "gs://$bucket_name/"
gsutil cp "$TABLE_NAME.sql" "gs://$bucket_name/"
# Dump the data, strip carriage returns, quotes and NULLs, tag each row with the schema name, and we're done
time mysql --quick "$TABLE_SCHEMA" -Bse"select * from $TABLE_NAME" \
  | sed -e "s/\(\\r\|\"\|NULL\)//g;s/$/\\t$TABLE_SCHEMA/" > "$TABLE_NAME.txt"
# split files with prefix
time split -C "$splitat" "$TABLE_NAME.txt" "$file_prefix"
GZIP_FLAGS=""
if [[ "$GZIP_ENABLED" == "true" ]]; then
  GZIP_FLAGS="-J"
fi
# -m copies in parallel; -J (when gzip is enabled) compresses the uploads in transit.
# GZIP_FLAGS is intentionally unquoted so it splits into separate arguments.
if ! gsutil -m cp $GZIP_FLAGS "$file_prefix"* "gs://$bucket_name/"; then
  cleanup_files
  echo "$file_prefix files could not be copied to cloud"
  exit 3
fi
##
## Import to BigQuery
##
for mylist in $(gsutil ls "gs://$bucket_name/*" | xargs -n$bulkfiles | tr ' ' ','); do
  echo "$mylist"
  ##
  ## Check if we have a DATE column, if we do then we can use partitioning
  ##
  BQ_ARGS="--nosync -F \t --job_id=$TABLE_NAME --max_bad_records=$maxbad"
  if grep -q "DATE" "$TABLE_NAME.json"; then
    FIELD=$(grep "DATE" "$TABLE_NAME.json" | tail -1 | cut -d'"' -f4)
    BQ_ARGS+=" --time_partitioning_type=DAY --time_partitioning_field=${FIELD}"
  fi
  if ! bq load $BQ_ARGS "${BIGQUERY_DATASET}.${TABLE_NAME}" "$mylist" "${TABLE_NAME}.json"; then
    echo "bq load failed for $mylist, check the files exist in cloud"
    cleanup_files
    exit 2
  fi
done
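For illustration only: given a hypothetical table orders with an INT column id and a DATE column created_at, the schema file built by the JSON query and the two echo lines above would come out roughly as:

cat orders.json
[
{"name": "id","type":"INTEGER"},
{"name": "created_at","type":"DATE"}
, {"name": "hostname","type":"STRING"} ]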
#!/bin/bash
BIGQUERY_DATASET=$1
TABLE_SCHEMA=$2
LOG_TO_FILE="false"
if [[ $# -ne 2 ]]; then
  echo 'ERROR, must provide two arguments: ./wrapper.sh BIGQUERY_DATASET TABLE_SCHEMA'
  exit 1
fi
bq mk "$BIGQUERY_DATASET" || true
# mysqlshow prints a bordered table with a header at the top; we strip the header with tail
# and then use awk to get the contents of the table cell instead of the borders
for tbl_name in $(mysqlshow "$TABLE_SCHEMA" | tail -n +5 | awk '{print $2}'); do
  #sh -xv mysql2bq.sh $TABLE_SCHEMA $tbl_name > script_succ.txt 2> script_err.txt
  if [[ "$LOG_TO_FILE" == "true" ]]; then
    ./mysql2bq.sh "$TABLE_SCHEMA" "$tbl_name" "$BIGQUERY_DATASET" > script_success.txt 2> script_error.txt
  else
    ./mysql2bq.sh "$TABLE_SCHEMA" "$tbl_name" "$BIGQUERY_DATASET"
  fi
done
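A hedged usage sketch for the wrapper, assuming both scripts sit in the working directory and mysql, gsutil and bq are already configured (the dataset and schema names are placeholders):

./wrapper.sh my_bigquery_dataset my_schema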