Last active
December 7, 2016 05:50
-
-
Save sangheestyle/5529bc664098a7f4e3544ade2919988b to your computer and use it in GitHub Desktop.
Create a Python dependency directory for using scikit-learn on AWS Lambda
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
# Prepare a build host: install the compiler toolchain and the ATLAS/BLAS/
# LAPACK development packages, then write the numpy site configuration that
# the later pip build picks up from the home directory.

# -e: abort on the first failing command; -x: trace every command;
# pipefail: a pipeline fails if any stage fails, not only the last one.
set -ex
set -o pipefail

yum update -y
yum install -y \
    atlas-sse3-devel \
    blas-devel \
    gcc \
    gcc-c++ \
    lapack-devel \
    python27-devel \
    python-pip \
    python-wheel

cd ~

# numpy-site.cfg is required to use custom runtime_library_dirs
# https://github.com/numpy/numpy/blob/master/site.cfg.example
#
# A quoted heredoc is used instead of a series of unquoted `echo` calls:
# an unquoted `[atlas]` is a glob pattern and would be replaced by the name
# of any file called a, t, l or s in the current directory.
cat > ~/.numpy-site.cfg <<'EOF'
[atlas]
libraries=lapack,f77blas,cblas,atlas
search_static_first=true
runtime_library_dirs = /var/task/vendored/lib
extra_link_args = -lgfortran -lquadmath
EOF
shared_libs () {
    # Stage the native libraries next to the Python packages, in the same
    # directory that numpy-site.cfg names as runtime_library_dirs.
    local lib_dir=/var/task/vendored/lib

    mkdir -p "${lib_dir}/" || true

    # The wildcard copy takes everything from the ATLAS directory; it covers
    # the individual files that used to be copied one by one (liblapack.so.3,
    # libptf77blas.so.3, libf77blas.so.3, libptcblas.so.3, libcblas.so.3,
    # libatlas.so.3).
    cp /usr/lib64/atlas-sse3/* "${lib_dir}/"

    # Fortran runtime libraries named in extra_link_args.
    cp /usr/lib64/libgfortran.so.3 "${lib_dir}/"
    cp /usr/lib64/libquadmath.so.0 "${lib_dir}/"
}
do_pip () {
    # Upgrade the packaging tools, fetch the requirements list from S3 and
    # build every requirement from source (no pre-built wheels) so the
    # extensions are compiled against the libraries installed on this host.
    pip install pip wheel virtualenv virtualenvwrapper -U

    # Use the same bucket that the playbook uploads requirements.txt to and
    # that upload() writes the result into.
    aws s3 cp "s3://python-stack/requirements.txt" .

    pip install --no-binary :all: -r requirements.txt
}
strip_virtualenv () {
    # Strip symbols from every compiled extension in the active virtualenv
    # to reduce the size of the package.
    #
    # `-exec ... {} +` replaces `find | xargs strip`: it is safe for paths
    # containing whitespace and does not run `strip` at all when nothing
    # matches (a bare `strip` with no operands fails, which would abort the
    # script under `set -e` / `pipefail`).
    find "$VIRTUAL_ENV/lib64/python2.7/site-packages/" -name "*.so" -exec strip {} +
}
make_package () {
    # Build ~/vendored.zip from the virtualenv's site-packages plus the
    # staged shared libraries under /var/task/vendored/.
    #
    # Each zip runs in a subshell so the working directory of the caller is
    # never changed and there is no popd to run against an empty directory
    # stack when the directory change itself fails.
    ( cd "$VIRTUAL_ENV/lib/python2.7/site-packages/" && zip -r -9 -q ~/vendored.zip * )

    # Only the following 8 shared objects are required:
    #   liblapack.so.3
    #   libptf77blas.so.3
    #   libf77blas.so.3
    #   libptcblas.so.3
    #   libcblas.so.3
    #   libatlas.so.3
    #   libgfortran.so.3
    #   libquadmath.so.0
    # Drop static archives, unversioned/fully-versioned duplicates and the
    # unused clapack library before archiving.
    rm -f /var/task/vendored/lib/*.a
    rm -f /var/task/vendored/lib/*.so.3.0
    rm -f /var/task/vendored/lib/*.so
    rm -f /var/task/vendored/lib/libclapack.so.3

    # Second pass adds the lib/ directory to the same archive.
    ( cd /var/task/vendored/ && zip -r -9 -q ~/vendored.zip * )
}
upload () {
    # Copy ~/vendored.zip to S3 under a key that embeds this machine's EC2
    # instance id, read from the instance metadata endpoint.
    #
    # -f makes curl fail on an HTTP error instead of printing the error body,
    # and the emptiness check keeps a bad lookup from producing a key such as
    # "vendored-.zip".
    if ! inst_id=$(curl -sf http://169.254.169.254/latest/meta-data/instance-id) \
            || [ -z "$inst_id" ]; then
        echo "upload: could not determine the EC2 instance id" >&2
        return 1
    fi

    aws s3 cp ~/vendored.zip "s3://python-stack/vendored-${inst_id}.zip"
}
main () {
    # Entry point: create and activate a virtualenv, then run the build
    # stages in order (stage libs, install, strip, package, upload).
    export NPY_NUM_BUILD_JOBS=2
    export PATH="/usr/local/bin/:${PATH}"

    pip-2.7 install -U pip wheel virtualenv
    virtualenv --distribute python-stack
    . python-stack/bin/activate

    shared_libs
    do_pip
    strip_virtualenv
    make_package
    upload
}

main
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
---
# Upload requirements.txt to S3, launch an EC2 instance whose user data is
# the build script, poll S3 until the instance has uploaded its vendored
# archive, download it, then remove requirements.txt from the bucket.
#
# NOTE(review): no task visible in this play stops or terminates the launched
# instance - confirm that it is cleaned up elsewhere.
- name: launch EC2 instance and build scikitlearn
  hosts: localhost
  connection: local
  vars:
    ssh_key: xxx-api
    profile_name: ec2-write-demo-bucket
    subnet_id: subnet-1111111
  tasks:
    - name: Upload requirements.txt to s3
      s3:
        bucket: python-stack
        object: requirements.txt
        src: requirements.txt
        mode: put
      register: result
      until: result.failed is not defined or result.failed == false
      retries: 10
      delay: 5

    - name: Launch build instance
      register: ectwo
      ec2:
        key_name: "{{ vars.ssh_key }}"
        instance_type: c3.large
        image: ami-60b6c60a
        wait: yes
        instance_profile_name: "{{ vars.profile_name }}"
        # The build script is passed as user data.
        user_data: "{{ lookup('file', 'build_vendored.sh') }}"
        # networking
        region: us-east-1
        vpc_subnet_id: "{{ vars.subnet_id }}"
        assign_public_ip: yes
        groups:
          - stage-dengine-api

    # The object key embeds the instance id, matching what the build script
    # uploads; retry for up to 15 x 90 s while the build is still running.
    - name: Download vendored object from s3
      s3:
        bucket: python-stack
        object: "vendored-{{ ectwo.instance_ids[0] }}.zip"
        dest: vendored.zip
        mode: get
      register: result
      until: result.failed is not defined or result.failed == false
      retries: 15
      delay: 90

    - name: Remove requirements.txt from s3
      s3:
        bucket: python-stack
        object: requirements.txt
        mode: delobj
      register: result
      until: result.failed is not defined or result.failed == false
      retries: 10
      delay: 5
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy | |
from sklearn.feature_extraction.text import TfidfVectorizer | |
def remove_possible_duplicates(docs, threshold=0.8):
    """Return *docs* without entries that closely repeat an earlier entry.

    Documents are vectorised with ``TfidfVectorizer`` and compared pairwise
    through the product of the TF-IDF matrix with its transpose. A document
    is dropped when its similarity to ANY earlier document in *docs* exceeds
    *threshold* - including earlier documents that were themselves dropped.
    Order of the surviving documents is preserved.

    :param docs: sequence of text documents.
    :param threshold: similarity above which a later document counts as a
        duplicate of an earlier one.
    :returns: a list of the kept documents, or *docs* itself (unchanged) when
        it is empty/falsy or when vectorisation raises ``ValueError``.
    """
    if not docs:
        return docs

    vect = TfidfVectorizer(min_df=1)
    try:
        tfidf = vect.fit_transform(docs)
    except ValueError:
        # avoid empty vocabulary
        return docs

    # .dot() is an explicit matrix product and .toarray() is the supported
    # spelling of the deprecated ``.A`` attribute on scipy sparse results.
    similarity = tfidf.dot(tfidf.T).toarray()

    # Strict lower triangle: row i only looks at documents j < i, so the
    # first occurrence of a group of near-duplicates is always kept.
    is_duplicate = numpy.tril(similarity > threshold, -1).any(axis=1)

    # Filter the original objects directly rather than round-tripping them
    # through a numpy string array, which can coerce the elements.
    return [doc for doc, dup in zip(docs, is_duplicate) if not dup]
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment