Copy a MySQL table to BigQuery. If you need to copy all tables, use the loop given at the end.
The script exits with error code 3 if blob or text columns are found. The exported data files are first copied to Google Cloud Storage before being imported into BigQuery.
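For reference, a typical single-table run looks like the following (assuming the script was saved as mysql_to_big_query.sh as in the install notes below; the database name "mydb" and table name "customers" are just placeholders):

sh mysql_to_big_query.sh mydb customers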
#!/bin/sh
TABLE_SCHEMA=$1
TABLE_NAME=$2
mytime=`date '+%y%m%d%H%M'`
hostname=`hostname | tr 'A-Z' 'a-z'`
file_prefix="trimax$TABLE_NAME$mytime$TABLE_SCHEMA"
bucket_name=$file_prefix
splitat="4000000000"
bulkfiles=200
maxbad=300
# make sure schema and table names are supplied
if [ $# -ne 2 ];then
echo "DB and table name required"
exit 1
fi
# make sure the table does not have blob or text columns
cat > blob_query.txt << heredoc
select sum(IF((DATA_TYPE LIKE '%blob%' OR DATA_TYPE LIKE '%text%'),1, 0)) from INFORMATION_SCHEMA.columns where TABLE_SCHEMA = '$TABLE_SCHEMA' AND TABLE_NAME = '$TABLE_NAME'
heredoc
mycount=`mysql -Bs < blob_query.txt`
if [ $mycount -ne 0 ];then
echo "blob or text column found in table $TABLE_NAME"
exit 3
fi
# create google cloud bucket
gsutil mb gs://$bucket_name
if [ $? -ne 0 ];then
echo "bucket $bucket_name could not be created in cloud"
exit 4
fi
# create JSON schema from mysql table structure
cat > json_query.txt << heredoc
select CONCAT('{"name": "', COLUMN_NAME, '","type":"', IF(DATA_TYPE like "%int%", "INTEGER",IF(DATA_TYPE = "decimal","FLOAT","STRING")) , '"},') as json from information_schema.columns where TABLE_SCHEMA = '$TABLE_SCHEMA' AND TABLE_NAME = '$TABLE_NAME';
heredoc
echo '[' > $TABLE_NAME.json
mysql -Bs < json_query.txt | sed '$s/,$//' >> $TABLE_NAME.json
mysql $TABLE_SCHEMA -Bse"show create table $TABLE_NAME\G" > $TABLE_NAME.sql
echo ', {"name": "hostname","type":"STRING"} ]' >> $TABLE_NAME.json
# copy the JSON schema and the CREATE TABLE statement to cloud
gsutil cp $TABLE_NAME.json gs://$bucket_name/
gsutil cp $TABLE_NAME.sql gs://$bucket_name/
# dump data
time mysql $TABLE_SCHEMA -Bse"select * from $TABLE_NAME" > $TABLE_NAME.txt1
tr -d "\r" < $TABLE_NAME.txt1 > $TABLE_NAME.txt
# append the schema name as an extra column on every row (it maps to the trailing "hostname" field in the JSON schema)
sed -i "s/$/\t$TABLE_SCHEMA/" $TABLE_NAME.txt
# strip any remaining literal carriage returns (type Ctrl-v Ctrl-m in the terminal to enter the actual CR character here)
sed -i 's/(Ctrl-v)(Ctrl-m)//g' $TABLE_NAME.txt
# split files with prefix
time split -C $splitat $TABLE_NAME.txt $file_prefix
# loop and upload files to google cloud
for file in `ls $file_prefix*`
do
# big query does not seem to like double quotes and NULL
time sed -i -e 's/\"//g' -e's/NULL//g' $file
time gzip $file
# copy to google cloud
time gsutil cp $file.gz gs://$bucket_name/
if [ $? -ne 0 ];then
echo "$file could not be copied to cloud"
exit 3
fi
rm -f $file.gz
done
# import data to big query
for mylist in `gsutil ls gs://$bucket_name/*.gz | xargs -n$bulkfiles | tr ' ' ','`
do
echo $mylist
mytime=`date '+%b%d%y'`
time bq mk $mytime
time bq load --nosync -F '\t' --job_id="$file" --max_bad_records=$maxbad $mytime.$TABLE_NAME $mylist $TABLE_NAME.json
if [ $? -ne 0 ];then
echo "bq load failed for $file, check that the file exists in cloud"
exit 2
fi
#sleep 35
done
rm -f $TABLE_NAME.json $TABLE_NAME.sql $TABLE_NAME.txt
exit
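For illustration only: assuming a hypothetical table with an INT column id and a VARCHAR column name, the generated $TABLE_NAME.json schema file would look roughly like this (the trailing "hostname" field covers the extra schema-name column that the sed step appends to every data row):

[
{"name": "id","type":"INTEGER"},
{"name": "name","type":"STRING"}
, {"name": "hostname","type":"STRING"} ]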
#!/bin/sh
TABLE_SCHEMA='drupaldb'
for tbl_name in `mysqlshow $TABLE_SCHEMA | awk '{print $2}'`
do
sh -xv myscript.sh $TABLE_SCHEMA $tbl_name > script_succ.txt 2> script_err.txt
done
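Note that mysqlshow prints header and border lines, so the awk above also yields a couple of non-table tokens ("Tables" and the database name). A minimal alternative sketch, assuming the mysql client can connect without prompting, uses show tables instead and appends to the log files rather than overwriting them on each iteration:

#!/bin/sh
TABLE_SCHEMA='drupaldb'
# -Bse prints one bare table name per line, with no headers or borders
for tbl_name in `mysql $TABLE_SCHEMA -Bse 'show tables'`
do
sh -xv myscript.sh $TABLE_SCHEMA $tbl_name >> script_succ.txt 2>> script_err.txt
done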
# install google utilities
wget http://commondatastorage.googleapis.com/pub/gsutil.tar.gz
tar xfz gsutil.tar.gz -C $HOME
vi ~/.bashrc
export PATH=${PATH}:$HOME/gsutil
cd gsutil
python setup.py install
gsutil config
sudo sh
easy_install bigquery
bq init
# use wget to download this script
wget https://gist.github.com/raw/4466298/8f842e248db27c336f8726116943afaf17d29ffb/mysql_to_big_query.sh
brew install coreutils
vi .bash_profile
add this at the end of the file:
export PATH="/usr/local/opt/coreutils/libexec/gnubin:$PATH"
and the script works :)
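Alternatively, if you would rather not change PATH, Homebrew's coreutils exposes the GNU tools with a g prefix, so only the split line in the script needs to change; a minimal sketch, assuming brew install coreutils has already been run:

# macOS: call the GNU split installed by coreutils instead of the BSD split
time gsplit -C $splitat $TABLE_NAME.txt $file_prefix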
I ran a test on a simple table (2 int, 1 date, 1 float) and it fails.
I'm running on macOS; maybe the split command has different options?
[19:49] thomas@busybox:/Volumes/GoogleDrive/My Drive/CRF/RedCrossQuest$ ./mysql_to_big_query.sh rcq_fr_test_db daily_stats_before_rcq
Creating gs://trimaxdaily_stats_before_rcq1811071950rcq_fr_test_db/...
Copying file://daily_stats_before_rcq.json [Content-Type=application/json]...
Operation completed over 1 objects/180.0 B.
Copying file://daily_stats_before_rcq.sql [Content-Type=application/x-sql]...
Operation completed over 1 objects/474.0 B.
real 0m0.089s
user 0m0.012s
sys 0m0.005s
sed: 1: "daily_stats_before_rcq.txt": extra characters at the end of d command
sed: 1: "daily_stats_before_rcq.txt": extra characters at the end of d command
split: illegal option -- C
usage: split [-a sufflen] [-b byte_count] [-l line_count] [-p pattern]
[file [prefix]]
real 0m0.011s
user 0m0.001s
sys 0m0.004s
ls: trimaxdaily_stats_before_rcq1811071950rcq_fr_test_db*: No such file or directory
CommandException: One or more URLs matched no objects.
[19:50] thomas@busybox:/Volumes/GoogleDrive/My Drive/CRF/RedCrossQuest$
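The sed errors happen because BSD sed requires an explicit (possibly empty) backup suffix after -i, and BSD split has no -C option at all, as the usage message shows. A rough BSD-compatible sketch of the affected lines, if you do not want to install GNU coreutils (the line count of 5000000 is just an illustrative stand-in for the 4 GB -C limit):

# BSD sed: pass an empty backup suffix after -i
sed -i '' "s/$/\t$TABLE_SCHEMA/" $TABLE_NAME.txt
# BSD split: split by line count instead of a byte limit at line boundaries
time split -l 5000000 $TABLE_NAME.txt $file_prefix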