mysql_to_big_query.sh
#!/bin/sh
TABLE_SCHEMA=$1
TABLE_NAME=$2
mytime=`date '+%y%m%d%H%M'`
hostname=`hostname | tr 'A-Z' 'a-z'`
file_prefix="trimax$TABLE_NAME$mytime$TABLE_SCHEMA"
bucket_name=$file_prefix
splitat="4000000000" # split the dump at roughly 4 GB per file
bulkfiles=200 # number of gzipped files handed to each bq load job
maxbad=300 # bad records tolerated per load before bq gives up
# make sure schema and table names are supplied
if [ $# -ne 2 ];then
echo "DB and table name required"
exit 1
fi
# make sure the table does not have blob or text columns
# (a tab-delimited text dump cannot carry blob/text values safely)
cat > blob_query.txt << heredoc
select sum(IF((DATA_TYPE LIKE '%blob%' OR DATA_TYPE LIKE '%text%'),1, 0)) from INFORMATION_SCHEMA.columns where TABLE_SCHEMA = '$TABLE_SCHEMA' AND TABLE_NAME = '$TABLE_NAME'
heredoc
mycount=`mysql -Bs < blob_query.txt`
if [ "$mycount" -ne 0 ];then
echo "blob or text column found in table $TABLE_NAME"
exit 3
fi
# create google cloud bucket
gsutil mb gs://$bucket_name
if [ $? -ne 0 ];then
echo "bucket $bucket_name could not be created in cloud"
exit 4
fi
# create JSON schema from mysql table structure
cat > json_query.txt << heredoc
select CONCAT('{"name": "', COLUMN_NAME, '","type":"', IF(DATA_TYPE like "%int%", "INTEGER",IF(DATA_TYPE = "decimal","FLOAT","STRING")) , '"},') as json from information_schema.columns where TABLE_SCHEMA = '$TABLE_SCHEMA' AND TABLE_NAME = '$TABLE_NAME';
heredoc
echo '[' > $TABLE_NAME.json
mysql -Bs < json_query.txt | sed '$s/,$//' >> $TABLE_NAME.json
mysql $TABLE_SCHEMA -Bse"show create table $TABLE_NAME\G" > $TABLE_NAME.sql
echo ', {"name": "hostname","type":"STRING"} ]' >> $TABLE_NAME.json
# copy the JSON schema and CREATE TABLE statement to cloud
gsutil cp $TABLE_NAME.json gs://$bucket_name/
gsutil cp $TABLE_NAME.sql gs://$bucket_name/
# dump data
time mysql $TABLE_SCHEMA -Bse"select * from $TABLE_NAME" > $TABLE_NAME.txt1
tr -d "\r" < $TABLE_NAME.txt1 > $TABLE_NAME.txt
sed -i "s/$/\t$TABLE_SCHEMA/" $TABLE_NAME.txt
# second pass for stray carriage returns (redundant with the tr above, kept as a safeguard)
sed -i "s/\r//g" $TABLE_NAME.txt
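# at this point every data row ends with a tab plus the schema name,
# which lines up with the extra "hostname" column appended to the JSON schema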
# split files with prefix
time split -C $splitat $TABLE_NAME.txt $file_prefix
# loop and upload files to google cloud
for file in `ls $file_prefix*`
do
# big query does not seem to like double quotes and NULL
time sed -i -e 's/\"//g' -e 's/NULL//g' $file
time gzip $file
# copy to google cloud
time gsutil cp $file.gz gs://$bucket_name/
if [ $? -ne 0 ];then
echo "$file could not be copied to cloud"
exit 3
fi
rm -f $file.gz
done
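# (gsutil -m cp could parallelize these uploads; the loop stays sequential so
# each file's copy status can be checked before the local .gz is deleted)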
# import data to big query
for mylist in `gsutil ls gs://$bucket_name/*.gz | xargs -n$bulkfiles | tr ' ' ','`
do
echo $mylist
mytime=`date '+%b%d%y'`
# dataset is named after today's date; bq mk fails harmlessly if it already exists
time bq mk $mytime
time bq load --nosync -F '\t' --max_bad_records=$maxbad $mytime.$TABLE_NAME $mylist $TABLE_NAME.json
# with --nosync the exit status only confirms the job was submitted
if [ $? -ne 0 ];then
echo "bq load failed for $mylist, check that the files exist in cloud"
exit 2
fi
#sleep 35
done
exit
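Because the loads run with --nosync, the script only confirms that each job was submitted; the loads themselves finish asynchronously (hence the commented-out sleep). To check on them afterwards, something like this works (the job id is a placeholder):

bq ls -j -n 10 # list the ten most recent jobs and their states
bq wait job_xxxxxx # block until the given job finishes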
#!/bin/sh
TABLE_SCHEMA='drupaldb'
# mysqlshow prints a bordered table; $2 of each row is the table name
# (the header row comes through as "Tables" and may need skipping)
for tbl_name in `mysqlshow $TABLE_SCHEMA | awk '{print $2}'`
do
sh -xv myscript.sh $TABLE_SCHEMA $tbl_name >> script_succ.txt 2>> script_err.txt
done
# install google utilities
wget http://commondatastorage.googleapis.com/pub/gsutil.tar.gz
tar xfz gsutil.tar.gz -C $HOME
# add the export line below to ~/.bashrc:
vi ~/.bashrc
export PATH=${PATH}:$HOME/gsutil
cd gsutil
python setup.py install
gsutil config
# install the bq command-line tool
sudo sh
easy_install bigquery
bq init
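A quick sanity check that both tools ended up on PATH:

gsutil version
bq version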
# use wget to download this script
wget https://gist.githubusercontent.com/raw/4466298/8f842e248db27c336f8726116943afaf17d29ffb/mysql_to_big_query.sh
Fantastic, ty for posting
I tested it on a simple table (2 int, 1 date, 1 float) and it fails.
I'm running on macOS; maybe the split command has different options?
[19:49] thomas@busybox:/Volumes/GoogleDrive/My Drive/CRF/RedCrossQuest$ ./mysql_to_big_query.sh rcq_fr_test_db daily_stats_before_rcq
Creating gs://trimaxdaily_stats_before_rcq1811071950rcq_fr_test_db/...
Copying file://daily_stats_before_rcq.json [Content-Type=application/json]...
- [1 files][ 180.0 B/ 180.0 B]
Operation completed over 1 objects/180.0 B.
Copying file://daily_stats_before_rcq.sql [Content-Type=application/x-sql]...
- [1 files][ 474.0 B/ 474.0 B]
Operation completed over 1 objects/474.0 B.
real 0m0.089s
user 0m0.012s
sys 0m0.005s
sed: 1: "daily_stats_before_rcq.txt": extra characters at the end of d command
sed: 1: "daily_stats_before_rcq.txt": extra characters at the end of d command
split: illegal option -- C
usage: split [-a sufflen] [-b byte_count] [-l line_count] [-p pattern]
[file [prefix]]
real 0m0.011s
user 0m0.001s
sys 0m0.004s
ls: trimaxdaily_stats_before_rcq1811071950rcq_fr_test_db*: No such file or directory
CommandException: One or more URLs matched no objects.
[19:50] thomas@busybox:/Volumes/GoogleDrive/My Drive/CRF/RedCrossQuest$
brew install coreutils
vi .bash_profile
add this at the end of the file:
export PATH="/usr/local/opt/coreutils/libexec/gnubin:$PATH"
and the script works :)
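An alternative to changing PATH is to pick GNU split explicitly inside the script, a sketch assuming Homebrew's coreutils installs it as gsplit:

# prefer GNU split (gsplit on macOS with coreutils), fall back to the system split
SPLIT=`command -v gsplit || command -v split`
time $SPLIT -C $splitat $TABLE_NAME.txt $file_prefix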
It works with text fields too; I just removed the text data type check and it worked flawlessly!
Excellent script! Congrats!