Created
June 3, 2012 13:48
-
-
Save gerigk/2863552 to your computer and use it in GitHub Desktop.
Build binaries to run Pandas with EMR
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
# Builds the binaries needed to run pandas on EMR and uploads each built tree
# to s3://<bucketname>/emr_resources/ so that later bootstrap runs can reuse
# them instead of rebuilding.
###################
# configuration here
###################
# Name of the S3 bucket that receives the built artifacts.
bucketname="my_bucket_name"
##########################
# Everything below downloads and builds relative to this directory, so stop
# right away if it cannot be entered.
cd /home/hadoop || exit 1
# First we set two vars; the build produced errors without them.
# ${VAR:+:${VAR}} appends the previous value only when it is non-empty, so an
# unset variable no longer leaves a trailing ':' (an empty search-path entry).
export LD_LIBRARY_PATH="/usr/local/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"
export LD_RUN_PATH="/usr/local/lib${LD_RUN_PATH:+:${LD_RUN_PATH}}"
# sqlite3 is needed for python but somehow isn't detected, so build it here.
wget http://www.sqlite.org/sqlite-autoconf-3070603.tar.gz || exit 1
tar -vxf sqlite-autoconf-3070603.tar.gz || exit 1
# Build in a subshell: the working directory is restored automatically and a
# failed cd/configure/make stops the script instead of running in the wrong place.
(cd sqlite-autoconf-3070603 && ./configure && make) || exit 1
# Archive the built tree and upload it for later bootstrap runs.
tar -czf sqlite.tar.gz sqlite-autoconf-3070603 || exit 1
# Upload stays best-effort (as before), but a failure is now reported.
hadoop fs -put sqlite.tar.gz "s3://${bucketname}/emr_resources/python_binaries/sqlite.tar.gz" \
  || printf 'warning: upload of %s failed\n' sqlite.tar.gz >&2
# Install on this machine so the following builds can use it.
(cd sqlite-autoconf-3070603 && sudo make install) || exit 1
# Now python itself.
wget http://www.python.org/ftp/python/2.7.3/Python-2.7.3.tar.bz2 || exit 1
tar -xjf Python-2.7.3.tar.bz2 || exit 1
# Plain ./configure on purpose: enabling --with-py-debug caused a weird pandas
# build error.
(cd Python-2.7.3 && ./configure && make -s) || exit 1
# Upload the built tree for the next time the normal bootstrap script runs.
tar -czf Python-2.7.3.tar.gz Python-2.7.3 || exit 1
hadoop fs -put Python-2.7.3.tar.gz "s3://${bucketname}/emr_resources/python_binaries/Python-2.7.3.tar.gz" \
  || printf 'warning: upload of %s failed\n' Python-2.7.3.tar.gz >&2
# Now install python so the necessary libraries can be built against it.
(cd Python-2.7.3 && sudo make install) || exit 1
# Make our version the default one. ./configure was run without --prefix, so
# the interpreter is installed under /usr/local/bin, not /usr/bin; -f replaces
# the existing /usr/bin/python in one step (previously: rm followed by ln -s).
sudo ln -sfn /usr/local/bin/python2.7 /usr/bin/python || exit 1
# If you build python using --enable-shared then you have to set these links:
#sudo ln -s /usr/local/lib/libpython2.7.so.1.0 /usr/lib/
#sudo ln -s /usr/local/lib/libpython2.7.so /usr/
# Install setuptools; only run the installer if it was actually downloaded.
wget http://peak.telecommunity.com/dist/ez_setup.py || exit 1
sudo python ez_setup.py || exit 1
# Use the following (commented-out) ATLAS build only if you have a lot of time.
# Because of cpu throttling I wouldn't recommend building on ec2 unless you
# find a way to turn it off. I used a machine that runs debian stable and the
# same version of gcc; with later gccs it won't run on ec2.
#wget http://www.netlib.org/lapack/lapack-3.4.1.tgz
#wget http://downloads.sourceforge.net/project/math-atlas/Developer%20%28unstable%29/3.9.76/atlas3.9.76.tar.bz2
#tar -vxf atlas3.9.76.tar.bz2
#cd ATLAS
#mkdir build
#cd build
# -t 2 means 2 threads; depending on the ec2 instance you can choose more threads.
# -V 448 means SSE1/2/3 support. -A 14 means x86SSE364SSE2 architecture.
# Check the ATLAS documentation for more information.
#../configure -b 64 -V 448 -A 14 -t 2 --with-netlib-lapack-tarfile=/home/$USER/lapack-3.4.1.tgz --shared
#make
#make check
#make time
#cd lib
#make shared
#make ptshared
#cd ..
#cd ..
#cd ..
#tar -czf ATLAS.tar.gz ATLAS
#hadoop fs -put ATLAS.tar.gz s3://${bucketname}/emr_resources/libraries/ATLAS.tar.gz
# Start here if you downloaded my binaries or built them yourself.
# The prebuilt ATLAS tree is fetched from S3; without it nothing can be
# installed, so every step is checked.
hadoop fs -get "s3://${bucketname}/emr_resources/libraries/ATLAS.tar.gz" ATLAS.tar.gz || exit 1
tar -vxf ATLAS.tar.gz || exit 1
# Subshell keeps the working directory unchanged for the steps that follow.
(cd ATLAS/build && sudo make install) || exit 1
# hdf5
# First szip (szlib), which the hdf5 configure step below refers to.
wget http://www.hdfgroup.org/ftp/lib-external/szip/2.1/src/szip-2.1.tar.gz || exit 1
# The source tarball is unpacked before it is overwritten by the archive of the
# built tree below (both are named szip-2.1.tar.gz, as in the original script).
tar -vxf szip-2.1.tar.gz || exit 1
(cd szip-2.1 && ./configure && make) || exit 1
# Archive and upload the built tree for later bootstrap runs.
tar -czf szip-2.1.tar.gz szip-2.1 || exit 1
hadoop fs -put szip-2.1.tar.gz "s3://${bucketname}/emr_resources/libraries/szip-2.1.tar.gz" \
  || printf 'warning: upload of %s failed\n' szip-2.1.tar.gz >&2
(cd szip-2.1 && sudo make install) || exit 1
##### now hdf5
wget http://www.hdfgroup.org/ftp/HDF5/current/src/hdf5-1.8.9.tar.gz || exit 1
tar -vxf hdf5-1.8.9.tar.gz || exit 1
# Out-of-tree build in hdf5-1.8.9/build; -p keeps a re-run from failing when
# the directory already exists.
mkdir -p hdf5-1.8.9/build || exit 1
# NOTE(review): --with-szlib points into the szip tree under /home/hadoop;
# confirm this is where szip's "make install" actually places its library.
(
  cd hdf5-1.8.9/build \
    && ../configure --prefix=/usr/local --enable-fortran --enable-cxx \
      --with-szlib=/home/hadoop/szip-2.1/szip/lib \
    && make
) || exit 1
# Archive and upload the built tree for later bootstrap runs.
tar -czf hdf5-1.8.9.tar.gz hdf5-1.8.9 || exit 1
hadoop fs -put hdf5-1.8.9.tar.gz "s3://${bucketname}/emr_resources/libraries/hdf5-1.8.9.tar.gz" \
  || printf 'warning: upload of %s failed\n' hdf5-1.8.9.tar.gz >&2
(cd hdf5-1.8.9/build && sudo make install) || exit 1
# mrjob is needed of course. simplejson, boto and pyyaml are installed on the
# way; that is fine because it doesn't take long to install them. Feel free to
# add binaries for pyyaml with libyaml if you need the speed.
git clone https://github.com/Yelp/mrjob.git || exit 1
# mrjob is python code only, so the plain checkout is what gets uploaded for
# installation next time.
tar -czf mrjob.tar.gz mrjob || exit 1
hadoop fs -put mrjob.tar.gz "s3://${bucketname}/emr_resources/python_packages/mrjob.tar.gz" \
  || printf 'warning: upload of %s failed\n' mrjob.tar.gz >&2
# Install it now as well, so a job can already be run in this session.
(cd mrjob && sudo python setup.py install) || exit 1
# cython is needed in order to build pandas.
wget http://cython.org/release/Cython-0.16.tar.gz || exit 1
tar -vxf Cython-0.16.tar.gz || exit 1
(cd Cython-0.16 && python setup.py build) || exit 1
# Archive the built tree (this overwrites the downloaded source tarball of the
# same name) and upload it as the prepared binaries.
tar -czf Cython-0.16.tar.gz Cython-0.16 || exit 1
hadoop fs -put Cython-0.16.tar.gz "s3://${bucketname}/emr_resources/python_packages/Cython-0.16.tar.gz" \
  || printf 'warning: upload of %s failed\n' Cython-0.16.tar.gz >&2
# Now install it in order to build pandas.
(cd Cython-0.16 && sudo python setup.py install) || exit 1
# dateutil is also needed for pandas.
wget http://labix.org/download/python-dateutil/python-dateutil-1.5.tar.gz || exit 1
# dateutil doesn't contain any non-python code, so the downloaded tarball is
# uploaded unchanged.
hadoop fs -put python-dateutil-1.5.tar.gz "s3://${bucketname}/emr_resources/python_packages/python-dateutil-1.5.tar.gz" \
  || printf 'warning: upload of %s failed\n' python-dateutil-1.5.tar.gz >&2
# Now install it for pandas.
tar -vxf python-dateutil-1.5.tar.gz || exit 1
(cd python-dateutil-1.5 && sudo python setup.py install) || exit 1
# The same with pytz: upload the downloaded tarball as-is, then install it.
wget http://pypi.python.org/packages/source/p/pytz/pytz-2012c.tar.gz || exit 1
hadoop fs -put pytz-2012c.tar.gz "s3://${bucketname}/emr_resources/python_packages/pytz-2012c.tar.gz" \
  || printf 'warning: upload of %s failed\n' pytz-2012c.tar.gz >&2
# Install for the pandas tests.
tar -vxf pytz-2012c.tar.gz || exit 1
(cd pytz-2012c && sudo python setup.py install) || exit 1
################## we're close
################### time for numpy
### If you don't trust numpy 1.7 yet, use the following lines instead of the
### git clone below.
#wget http://sourceforge.net/projects/numpy/files/NumPy/1.6.2/numpy-1.6.2.tar.gz
#tar -vxf numpy-1.6.2.tar.gz
#cd numpy-1.6.2
git clone https://github.com/numpy/numpy.git || exit 1
# Create the site.cfg so numpy builds with atlas. The delimiter is quoted so
# the content is written literally, and the file is addressed by path so it
# can never land outside the numpy checkout.
cat >numpy/site.cfg <<'HEREDOC' || exit 1
[DEFAULT]
library_dirs = /usr/local/atlas/lib
include_dirs = /usr/local/atlas/include
[blas_opt]
libraries = ptf77blas, ptcblas, atlas
[lapack_opt]
libraries = lapack, ptf77blas, ptcblas, atlas
HEREDOC
# And finally build numpy.
(cd numpy && python setup.py build) || exit 1
# Archive and upload the built tree for later bootstrap runs.
tar -czf numpy.tar.gz numpy || exit 1
hadoop fs -put numpy.tar.gz "s3://${bucketname}/emr_resources/python_packages/numpy.tar.gz" \
  || printf 'warning: upload of %s failed\n' numpy.tar.gz >&2
# Now install numpy for the pandas build.
(cd numpy && sudo python setup.py install) || exit 1
# Again, if you want to use stable numpy use these lines instead:
#tar -czf numpy-1.6.2.tar.gz numpy-1.6.2
#hadoop fs -put numpy-1.6.2.tar.gz s3://${bucketname}/emr_resources/python_packages/numpy-1.6.2.tar.gz
# now we install numpy for the pandas build
#cd numpy-1.6.2
#sudo python setup.py install
#cd ..
# scipy
wget http://sourceforge.net/projects/scipy/files/scipy/0.10.1/scipy-0.10.1.tar.gz || exit 1
tar -vxf scipy-0.10.1.tar.gz || exit 1
# Subshell build: a failed cd or build aborts here instead of letting the
# script archive and install a broken tree.
(cd scipy-0.10.1 && python setup.py build) || exit 1
# Archive the built tree (replacing the source tarball of the same name) and
# upload it for later bootstrap runs.
tar -czf scipy-0.10.1.tar.gz scipy-0.10.1 || exit 1
hadoop fs -put scipy-0.10.1.tar.gz "s3://${bucketname}/emr_resources/python_packages/scipy-0.10.1.tar.gz" \
  || printf 'warning: upload of %s failed\n' scipy-0.10.1.tar.gz >&2
(cd scipy-0.10.1 && sudo python setup.py install) || exit 1
# numexpr
wget http://numexpr.googlecode.com/files/numexpr-2.0.1.tar.gz || exit 1
tar -vxf numexpr-2.0.1.tar.gz || exit 1
# Subshell build so the working directory stays put and failures stop the run.
(cd numexpr-2.0.1 && python setup.py build) || exit 1
# Archive the built tree (replacing the source tarball of the same name) and
# upload it for later bootstrap runs.
tar -czf numexpr-2.0.1.tar.gz numexpr-2.0.1 || exit 1
hadoop fs -put numexpr-2.0.1.tar.gz "s3://${bucketname}/emr_resources/python_packages/numexpr-2.0.1.tar.gz" \
  || printf 'warning: upload of %s failed\n' numexpr-2.0.1.tar.gz >&2
(cd numexpr-2.0.1 && sudo python setup.py install) || exit 1
# pytables
# First the lzo compression library, built as a shared library.
wget http://www.oberhumer.com/opensource/lzo/download/lzo-2.06.tar.gz || exit 1
tar -vxf lzo-2.06.tar.gz || exit 1
(cd lzo-2.06 && ./configure --enable-shared && make) || exit 1
# Archive the built tree (replacing the source tarball of the same name) and
# upload it for later bootstrap runs.
tar -czf lzo-2.06.tar.gz lzo-2.06 || exit 1
hadoop fs -put lzo-2.06.tar.gz "s3://${bucketname}/emr_resources/libraries/lzo-2.06.tar.gz" \
  || printf 'warning: upload of %s failed\n' lzo-2.06.tar.gz >&2
(cd lzo-2.06 && sudo make install) || exit 1
### now pytables
wget http://downloads.sourceforge.net/project/pytables/pytables/2.3.1/tables-2.3.1.tar.gz || exit 1
tar -vxf tables-2.3.1.tar.gz || exit 1
# Subshell build so the working directory stays put and failures stop the run.
(cd tables-2.3.1 && python setup.py build) || exit 1
# Archive the built tree (replacing the source tarball of the same name) and
# upload it for later bootstrap runs.
tar -czf tables-2.3.1.tar.gz tables-2.3.1 || exit 1
hadoop fs -put tables-2.3.1.tar.gz "s3://${bucketname}/emr_resources/python_packages/tables-2.3.1.tar.gz" \
  || printf 'warning: upload of %s failed\n' tables-2.3.1.tar.gz >&2
(cd tables-2.3.1 && sudo python setup.py install) || exit 1
# nose provides nosetests, used below to see whether everything went right.
# It is only installed locally, not uploaded.
wget http://pypi.python.org/packages/source/n/nose/nose-1.1.2.tar.gz || exit 1
tar -vxf nose-1.1.2.tar.gz || exit 1
(cd nose-1.1.2 && sudo python setup.py install) || exit 1
# And pandas.
git clone https://github.com/pydata/pandas.git || exit 1
# Regular build plus in-place extension build.
(cd pandas && python setup.py build && python setup.py build_ext --inplace) || exit 1
# Archive and upload the built tree for later bootstrap runs.
tar -czf pandas.tar.gz pandas || exit 1
hadoop fs -put pandas.tar.gz "s3://${bucketname}/emr_resources/python_packages/pandas.tar.gz" \
  || printf 'warning: upload of %s failed\n' pandas.tar.gz >&2
# The install and the test run both happen inside the checkout, as before.
cd pandas || exit 1
sudo python setup.py install || exit 1
# Create a unique filename from the MAC address of eth0 (colons removed).
unique=$(ip addr show dev eth0 | grep ether | tr -s ' ' | cut -d' ' -f 3 | tr -d ':')
# Without eth0 the pipeline yields nothing and the report would be named
# ".txt"; fall back to the host name in that case.
if [ -z "${unique}" ]; then
  unique=$(hostname)
fi
# Test failures are expected to be inspected in the report, so the exit status
# of nosetests is deliberately not checked.
nosetests pandas >"${unique}.txt" 2>&1
# Upload the result to s3.
hadoop fs -put "${unique}.txt" "s3://${bucketname}/emr_resources/python_packages/pandas_tests/${unique}.txt"
# Exit with the status of the upload above.
exit
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment