#!/bin/sh
TABLE_SCHEMA=$1
TABLE_NAME=$2
mytime=`date '+%y%m%d%H%M'`
hostname=`hostname | tr 'A-Z' 'a-z'`
file_prefix="trimax$TABLE_NAME$mytime$TABLE_SCHEMA"
bucket_name=$file_prefix
splitat="4000000000"
bulkfiles=200
maxbad=300
# make sure schema and table names are supplied
if [ $# -ne 2 ];then
echo "DB and table name required"
exit 1
fi
# make sure the table does not have blob or text columns
cat > blob_query.txt << heredoc
select sum(IF((DATA_TYPE LIKE '%blob%' OR DATA_TYPE LIKE '%text%'),1, 0)) from INFORMATION_SCHEMA.columns where TABLE_SCHEMA = '$TABLE_SCHEMA' AND TABLE_NAME = '$TABLE_NAME'
heredoc
mycount=`mysql -Bs < blob_query.txt`
if [ $mycount -ne 0 ];then
echo "blob or text column found in table $TABLE_NAME"
exit 3
fi
# create google cloud bucket
gsutil mb gs://$bucket_name
if [ $? -ne 0 ];then
echo "bucket $bucket_name could not be created in cloud"
exit 4
fi
# create JSON schema from mysql table structure
cat > json_query.txt << heredoc
select CONCAT('{"name": "', COLUMN_NAME, '","type":"', IF(DATA_TYPE like "%int%", "INTEGER",IF(DATA_TYPE = "decimal","FLOAT","STRING")) , '"},') as json from information_schema.columns where TABLE_SCHEMA = '$TABLE_SCHEMA' AND TABLE_NAME = '$TABLE_NAME';
heredoc
echo '[' > $TABLE_NAME.json
mysql -Bs < json_query.txt | sed '$s/,$//' >> $TABLE_NAME.json
mysql $TABLE_SCHEMA -Bse"show create table $TABLE_NAME\G" > $TABLE_NAME.sql
echo ', {"name": "hostname","type":"STRING"} ]' >> $TABLE_NAME.json
# copy json and create table data to cloud
gsutil cp $TABLE_NAME.json gs://$bucket_name/
gsutil cp $TABLE_NAME.sql gs://$bucket_name/
# dump data
time mysql $TABLE_SCHEMA -Bse"select * from $TABLE_NAME" > $TABLE_NAME.txt1
tr -d "\r" < $TABLE_NAME.txt1 > $TABLE_NAME.txt
sed -i "s/$/\t$TABLE_SCHEMA/" $TABLE_NAME.txt
# strip any remaining carriage returns (the original typed a literal ^M here as Ctrl-v Ctrl-m)
sed -i 's/\r//g' $TABLE_NAME.txt
# split files with prefix
time split -C $splitat $TABLE_NAME.txt $file_prefix
# loop and upload files to google cloud
for file in `ls $file_prefix*`
do
# big query does not seem to like double quotes and NULL
time sed -i -e 's/\"//g' -e's/NULL//g' $file
time gzip $file
# copy to google cloud
time gsutil cp $file.gz gs://$bucket_name/
if [ $? -ne 0 ];then
echo "$file could not be copied to cloud"
exit 3
fi
rm -f $file.gz
done
# import data to big query
for mylist in `gsutil ls gs://$bucket_name/*.gz | xargs -n$bulkfiles | tr ' ' ','`
do
echo $mylist
mytime=`date '+%b%d%y'`
time bq mk $mytime
time bq load --nosync -F '\t' --job_id="$file" --max_bad_records=$maxbad $mytime.$TABLE_NAME $mylist $TABLE_NAME.json
if [ $? -ne 0 ];then
echo "bq load failed for $file, check that the file exists in cloud"
exit 2
fi
#sleep 35
done
rm -f $TABLE_NAME.json $TABLE_NAME.sql $TABLE_NAME.txt
exit
#!/bin/sh
TABLE_SCHEMA='drupaldb'
for tbl_name in `mysqlshow $TABLE_SCHEMA | awk '{print $2}'`
do
sh -xv myscript.sh $TABLE_SCHEMA $tbl_name > script_succ.txt 2> script_err.txt
done
# install google utilities
wget http://commondatastorage.googleapis.com/pub/gsutil.tar.gz
tar xfz gsutil.tar.gz -C $HOME
vi ~/.bashrc
export PATH=${PATH}:$HOME/gsutil
cd gsutil
python setup.py install
gsutil config
sudo sh
easy_install bigquery
bq init
# use wget to download this script
wget https://gist.github.com/raw/4466298/8f842e248db27c336f8726116943afaf17d29ffb/mysql_to_big_query.sh
I found the solution: use the --quick option.
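For reference, a minimal sketch of what that looks like in the dump step of the script above (only the --quick flag is added, everything else unchanged):

# --quick makes the mysql client stream rows instead of buffering the whole
# result set in memory, so the dump is not killed on very large tables
time mysql $TABLE_SCHEMA --quick -Bse"select * from $TABLE_NAME" > $TABLE_NAME.txt1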
Excellent script. It worked flawlessly, apart from having to specify the username and password for mysql. One minor thing: the bucket name should be converted to lowercase, because Google Cloud Storage doesn't accept capital letters.
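A minimal sketch of that change, reusing the same tr call the script already applies to the hostname:

# Cloud Storage bucket names must be lowercase, so fold the prefix before creating the bucket
bucket_name=`echo $file_prefix | tr 'A-Z' 'a-z'`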
Why does it not work with text fields? BigQuery supports variable-length text fields, doesn't it?
It works with text fields. I just removed the text data type check from the verification query and it worked flawlessly!
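As a rough sketch, the blob_query.txt heredoc near the top of the script can be relaxed so that only blob columns are rejected and text columns pass through:

select sum(IF(DATA_TYPE LIKE '%blob%', 1, 0)) from INFORMATION_SCHEMA.columns where TABLE_SCHEMA = '$TABLE_SCHEMA' AND TABLE_NAME = '$TABLE_NAME'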
Excellent script! Congrats!
Fantastic, ty for posting
I've run a test on a simple table (2 int, 1 date, 1 float) and it fails.
I'm running on macOS; maybe the split command has different options?
[19:49] thomas@busybox:/Volumes/GoogleDrive/My Drive/CRF/RedCrossQuest$ ./mysql_to_big_query.sh rcq_fr_test_db daily_stats_before_rcq
Creating gs://trimaxdaily_stats_before_rcq1811071950rcq_fr_test_db/...
Copying file://daily_stats_before_rcq.json [Content-Type=application/json]...
- [1 files][ 180.0 B/ 180.0 B]
Operation completed over 1 objects/180.0 B.
Copying file://daily_stats_before_rcq.sql [Content-Type=application/x-sql]... - [1 files][ 474.0 B/ 474.0 B]
Operation completed over 1 objects/474.0 B.
real 0m0.089s
user 0m0.012s
sys 0m0.005s
sed: 1: "daily_stats_before_rcq.txt": extra characters at the end of d command
sed: 1: "daily_stats_before_rcq.txt": extra characters at the end of d command
split: illegal option -- C
usage: split [-a sufflen] [-b byte_count] [-l line_count] [-p pattern]
[file [prefix]]
real 0m0.011s
user 0m0.001s
sys 0m0.004s
ls: trimaxdaily_stats_before_rcq1811071950rcq_fr_test_db*: No such file or directory
CommandException: One or more URLs matched no objects.
[19:50] thomas@busybox:/Volumes/GoogleDrive/My Drive/CRF/RedCrossQuest$
brew install coreutils
vi .bash_profile
add this at the end of the file
export PATH="/usr/local/opt/coreutils/libexec/gnubin:$PATH"
and the script works :)
This is a nice script. It is working fine for me for small tables. The command gets killed when running this line for a table with 300 million rows:
time mysql $TABLE_SCHEMA -Bse"select * from $TABLE_NAME" > $TABLE_NAME.txt1
Any idea how to avoid that?