Created
June 3, 2012 14:00
-
-
Save gerigk/2863607 to your computer and use it in GitHub Desktop.
Bootstrap script for Amazon EMR that installs prebuilt binaries of pandas and all of its dependencies from S3.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
#
# EMR bootstrap action: install a prebuilt Python 2.7 stack (pandas, numpy,
# scipy, PyTables, mrjob, ...) from tarballs stored in S3, then run the
# pandas test suite and upload the log back to S3.
#
# The tarballs under emr_resources/ are assumed to be pre-compiled builds
# prepared for this EMR AMI, so only "make install" / "setup.py install"
# is needed — no configure/make step.
set -euo pipefail

###################
# configuration here
####################
readonly bucketname="your_bucket_name"
readonly s3_base="s3://${bucketname}/emr_resources"
##########################

cd /home/hadoop || exit 1

# First we set two vars... builds had errors without this: the freshly
# installed libraries in /usr/local/lib must be found at build and run time.
# ":-" guards keep set -u happy when the vars start out unset.
export LD_LIBRARY_PATH="/usr/local/lib:${LD_LIBRARY_PATH:-}"
export LD_RUN_PATH="/usr/local/lib:${LD_RUN_PATH:-}"

# Fetch a tarball from S3 and unpack it into the current directory.
# Arguments: $1 - S3 key relative to ${s3_base}
fetch_and_unpack() {
  local key=$1
  local archive=${key##*/}
  hadoop fs -get "${s3_base}/${key}" "${archive}"
  tar -vxf "${archive}"
}

# Run "sudo make install" inside a prepared build directory.
# The subshell keeps the caller's working directory unchanged.
# Arguments: $1 - directory containing the prepared build
make_install() {
  ( cd "$1" && sudo make install )
}

# Run "sudo python setup.py install" inside an unpacked package directory.
# Arguments: $1 - package directory
py_install() {
  ( cd "$1" && sudo python setup.py install )
}

# sqlite3 is needed for python but somehow isn't detected unless installed first
fetch_and_unpack "python_binaries/sqlite.tar.gz"
make_install sqlite-autoconf-3070603

# Now python itself; installing it builds the libraries everything else needs.
fetch_and_unpack "python_binaries/Python-2.7.3.tar.gz"
make_install Python-2.7.3
# Point /usr/bin/python at the new interpreter (-f: don't fail if absent).
sudo rm -f /usr/bin/python
sudo ln -s /usr/bin/python2.7 /usr/bin/python

# Install setuptools via the ez_setup bootstrap.
wget http://peak.telecommunity.com/dist/ez_setup.py
sudo python ez_setup.py

# ATLAS (BLAS/LAPACK) — required by numpy/scipy.
fetch_and_unpack "libraries/ATLAS.tar.gz"
make_install ATLAS/build

# HDF5 stack for PyTables: szip first, then hdf5.
fetch_and_unpack "libraries/szip-2.1.tar.gz"
make_install szip-2.1
fetch_and_unpack "libraries/hdf5-1.8.9.tar.gz"
make_install hdf5-1.8.9/build

# mrjob is needed of course. simplejson, boto and pyyaml are installed on
# the way; those are fine because they don't take long to install. Feel free
# to add binaries for pyyaml with libyaml if you need the speed.
fetch_and_unpack "python_packages/mrjob.tar.gz"
py_install mrjob

# Cython is needed to build pandas.
fetch_and_unpack "python_packages/Cython-0.16.tar.gz"
py_install Cython-0.16

# dateutil is also needed for pandas (pure python, no compiled code).
fetch_and_unpack "python_packages/python-dateutil-1.5.tar.gz"
py_install python-dateutil-1.5

# The same with pytz (needed for the pandas tests).
fetch_and_unpack "python_packages/pytz-2012c.tar.gz"
py_install pytz-2012c

################## we're close
################### time for numpy, then scipy and numexpr
fetch_and_unpack "python_packages/numpy.tar.gz"
py_install numpy
fetch_and_unpack "python_packages/scipy-0.10.1.tar.gz"
py_install scipy-0.10.1
fetch_and_unpack "python_packages/numexpr-2.0.1.tar.gz"
py_install numexpr-2.0.1

# PyTables needs liblzo for its lzo compression support.
fetch_and_unpack "libraries/lzo-2.06.tar.gz"
make_install lzo-2.06
fetch_and_unpack "python_packages/tables-2.3.1.tar.gz"
py_install tables-2.3.1

# nosetests so we can check whether everything went right.
wget http://pypi.python.org/packages/source/n/nose/nose-1.1.2.tar.gz
tar -vxf nose-1.1.2.tar.gz
py_install nose-1.1.2

# And pandas itself.
fetch_and_unpack "python_packages/pandas.tar.gz"
py_install pandas

# Run the pandas test suite under a filename unique to this instance (its
# eth0 MAC address). This is nice if you start hundreds of instances and
# still want to know that this setup produces a working pandas build.
# "|| true": test failures must not abort the upload of the results.
unique=$(ip addr show dev eth0 | grep ether | tr -s ' ' | cut -d' ' -f 3 | tr -d ':')
( cd pandas && nosetests pandas >"/home/hadoop/${unique}.txt" 2>&1 ) || true
hadoop fs -put "${unique}.txt" "${s3_base}/python_packages/pandas_tests/${unique}.txt"
exit 0
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment