Skip to content

Instantly share code, notes, and snippets.

@sangheestyle
Last active December 7, 2016 05:50
Show Gist options
  • Save sangheestyle/5529bc664098a7f4e3544ade2919988b to your computer and use it in GitHub Desktop.
Save sangheestyle/5529bc664098a7f4e3544ade2919988b to your computer and use it in GitHub Desktop.
Create a Python dependency directory for using scikit-learn on AWS Lambda
#!/bin/bash
# Build-host bootstrap: abort on the first failing command, trace every
# command as it runs, and treat a failure anywhere in a pipeline as a failure
# of the whole pipeline.
set -o errexit -o xtrace
set -o pipefail

# Bring the installed system packages up to date before adding anything.
yum update -y

# Toolchain and headers used by the build: C/C++ compilers, the
# ATLAS/BLAS/LAPACK development packages, Python 2.7 headers, pip and wheel.
build_packages=(
  atlas-sse3-devel
  blas-devel
  gcc
  gcc-c++
  lapack-devel
  python27-devel
  python-pip
  python-wheel
)
yum install -y "${build_packages[@]}"

# Everything below uses paths relative to the home directory
# (the virtualenv, requirements.txt).
cd ~
# numpy-site.cfg is required to use custom runtime_library_dirs
# https://github.com/numpy/numpy/blob/master/site.cfg.example
#
# The file is written from a quoted here-document so every line is emitted
# literally. An unquoted `echo [atlas]` is a glob pattern: if the working
# directory held a file named "a", "t", "l" or "s", the section header would
# be replaced by that file name.
cat > ~/.numpy-site.cfg <<'EOF'
[atlas]
libraries=lapack,f77blas,cblas,atlas
search_static_first=true
runtime_library_dirs = /var/task/vendored/lib
extra_link_args = -lgfortran -lquadmath
EOF
#######################################
# Stage the native libraries that the compiled Python extensions link
# against into the vendored library directory.
# Arguments (all optional; defaults match the build host layout):
#   $1 - destination directory         (default /var/task/vendored/lib)
#   $2 - ATLAS library directory       (default /usr/lib64/atlas-sse3)
#   $3 - directory holding libgfortran.so.3 and libquadmath.so.0
#                                      (default /usr/lib64)
# Returns:
#   0 on success; the failing command's status as soon as the destination
#   cannot be created or a copy fails.
#######################################
shared_libs () {
  local -r dest_dir="${1:-/var/task/vendored/lib}"
  local -r atlas_dir="${2:-/usr/lib64/atlas-sse3}"
  local -r system_lib_dir="${3:-/usr/lib64}"

  # `mkdir -p` already succeeds when the directory exists, so a failure here
  # is a real one (permissions, read-only filesystem) and must not be hidden.
  mkdir -p -- "${dest_dir}" || return

  # Every file in the ATLAS directory is copied. The ones known to be needed
  # are liblapack.so.3, libptf77blas.so.3, libf77blas.so.3, libptcblas.so.3,
  # libcblas.so.3 and libatlas.so.3.
  cp -- "${atlas_dir}"/* "${dest_dir}/" || return

  # Fortran runtime libraries.
  cp -- "${system_lib_dir}/libgfortran.so.3" "${dest_dir}/" || return
  cp -- "${system_lib_dir}/libquadmath.so.0" "${dest_dir}/" || return
}
#######################################
# Install the Python requirements from source into the active environment.
# Steps:
#   1. upgrade pip, wheel, virtualenv and virtualenvwrapper;
#   2. download requirements.txt from S3 into the current directory;
#   3. install every requirement with --no-binary :all:, so the packages are
#      built locally rather than installed from pre-built wheels.
# Returns:
#   0 on success; the failing command's status otherwise.
#######################################
do_pip () {
  # Same bucket that upload() writes to and that the playbook puts
  # requirements.txt into; the previous value pointed at a different bucket.
  local -r requirements_uri="s3://python-stack/requirements.txt"

  pip install pip wheel virtualenv virtualenvwrapper -U || return
  aws s3 cp "${requirements_uri}" . || return
  pip install --no-binary :all: -r requirements.txt || return
}
#######################################
# Strip every shared object (*.so) under the virtualenv's site-packages.
# Globals:
#   VIRTUAL_ENV (read) - root of the active virtualenv; must be non-empty.
# Returns:
#   0 on success (including when there is nothing to strip);
#   non-zero if VIRTUAL_ENV is unset/empty or a strip invocation fails.
#######################################
strip_virtualenv () {
  # Without this guard an empty VIRTUAL_ENV would make find walk
  # /lib64/python2.7/site-packages/ on the host itself.
  if [[ -z "${VIRTUAL_ENV:-}" ]]; then
    printf 'strip_virtualenv: VIRTUAL_ENV is not set\n' >&2
    return 1
  fi

  # `-exec … {} +` hands file names to strip without any word splitting
  # (safe for paths with spaces) and does not run strip at all when nothing
  # matches, unlike `find | xargs strip`.
  find "${VIRTUAL_ENV}/lib64/python2.7/site-packages/" -name '*.so' \
    -exec strip {} +
}
#######################################
# Build ~/vendored.zip from the virtualenv's site-packages plus the staged
# native libraries.
# Globals:
#   VIRTUAL_ENV (read) - root of the active virtualenv.
# Outputs:
#   Creates/updates ~/vendored.zip. Entries are stored relative to
#   site-packages (Python packages) and to /var/task/vendored (lib/...).
# Returns:
#   0 on success; non-zero if a directory cannot be entered or zip fails.
#######################################
make_package() {
  local -r site_packages="${VIRTUAL_ENV}/lib/python2.7/site-packages"
  local -r vendored_root=/var/task/vendored

  # Run zip from inside the directory so archive paths are relative to it.
  # A subshell keeps the caller's working directory untouched even when cd or
  # zip fails, which `pushd … ; popd` does not guarantee.
  ( cd "${site_packages}" && zip -r -9 -q ~/vendored.zip * ) || return

  # The following 8 shared objects are required only.
  # liblapack.so.3
  # libptf77blas.so.3
  # libf77blas.so.3
  # libptcblas.so.3
  # libcblas.so.3
  # libatlas.so.3
  # libgfortran.so.3
  # libquadmath.so.0
  # Everything else that was staged (static archives, fully-versioned and
  # unversioned .so names, libclapack) is removed before archiving.
  rm -rf "${vendored_root}"/lib/*.a
  rm -rf "${vendored_root}"/lib/*.so.3.0
  rm -rf "${vendored_root}"/lib/*.so
  rm -rf "${vendored_root}/lib/libclapack.so.3"

  # Add the pruned lib/ directory to the same archive.
  ( cd "${vendored_root}" && zip -r -9 -q ~/vendored.zip * ) || return
}
#######################################
# Upload ~/vendored.zip to S3 under a name derived from this EC2 instance's
# id, as reported by the instance metadata endpoint.
# Outputs:
#   Diagnostics on stderr when the instance id cannot be determined.
# Returns:
#   0 on success; non-zero if the metadata request fails, returns an empty
#   id, or the S3 copy fails.
#######################################
upload () {
  local inst_id

  # -f makes curl exit non-zero on an HTTP error instead of printing the
  # error page, which would otherwise end up inside the object name.
  inst_id=$(curl -sf http://169.254.169.254/latest/meta-data/instance-id) || {
    printf 'upload: could not read instance-id from the metadata service\n' >&2
    return 1
  }
  if [[ -z "${inst_id}" ]]; then
    printf 'upload: metadata service returned an empty instance-id\n' >&2
    return 1
  fi

  aws s3 cp ~/vendored.zip "s3://python-stack/vendored-${inst_id}.zip"
}
#######################################
# Entry point: prepare a virtualenv named python-stack in the current
# directory, then run the build stages in order.
#######################################
main () {
  local stage

  # Look in /usr/local/bin first, and set NPY_NUM_BUILD_JOBS=2 for the
  # builds that follow.
  export PATH="/usr/local/bin/:${PATH}"
  export NPY_NUM_BUILD_JOBS=2

  # Upgrade the packaging tools, then create and activate the virtualenv.
  pip-2.7 install pip wheel virtualenv -U
  virtualenv python-stack --distribute
  . python-stack/bin/activate

  # Stage native libs -> install requirements -> strip binaries ->
  # build the archive -> upload it.
  for stage in shared_libs do_pip strip_virtualenv make_package upload; do
    "${stage}"
  done
}

main
---
# Launches a throw-away EC2 instance that runs build_vendored.sh as its
# user-data, waits for the resulting vendored-<instance-id>.zip to appear in
# S3, downloads it, and cleans up afterwards.
- name: launch EC2 instance and build scikitlearn
  hosts: localhost
  connection: local
  vars:
    ssh_key: xxx-api
    profile_name: ec2-write-demo-bucket
    subnet_id: subnet-1111111
  tasks:
    - name: Upload requirements.txt to s3
      s3:
        bucket: python-stack
        object: requirements.txt
        src: requirements.txt
        mode: put
      register: result
      until: result.failed is not defined or result.failed == false
      retries: 10
      delay: 5

    - name: Launch build instance
      register: ectwo
      ec2:
        key_name: "{{ ssh_key }}"
        instance_type: c3.large
        image: ami-60b6c60a
        wait: yes
        instance_profile_name: "{{ profile_name }}"
        # The build script is passed as user-data.
        user_data: "{{ lookup('file', 'build_vendored.sh') }}"
        # networking biz
        region: us-east-1
        vpc_subnet_id: "{{ subnet_id }}"
        assign_public_ip: yes
        groups:
          - stage-dengine-api

    # The object only exists once the build script has finished and uploaded
    # it, so keep polling: 15 attempts, 90 seconds apart.
    - name: Download vendored object from s3
      s3:
        bucket: python-stack
        object: "vendored-{{ ectwo.instance_ids[0] }}.zip"
        dest: vendored.zip
        mode: get
      register: result
      until: result.failed is not defined or result.failed == false
      retries: 15
      delay: 90

    - name: Remove requirements.txt from s3
      s3:
        bucket: python-stack
        object: requirements.txt
        mode: delobj
      register: result
      until: result.failed is not defined or result.failed == false
      retries: 10
      delay: 5

    # The instance has done its job once the archive is downloaded; without
    # this task it would be left running indefinitely. If the download never
    # succeeds the play stops before this point and the instance is left
    # available for debugging.
    - name: Terminate build instance
      ec2:
        state: absent
        region: us-east-1
        instance_ids: "{{ ectwo.instance_ids }}"
import numpy
from sklearn.feature_extraction.text import TfidfVectorizer
def remove_possible_duplicates(docs, threshold=0.8):
    """Drop documents that are near-duplicates of an earlier document.

    Each document is turned into a TF-IDF vector, and the pairwise products
    of those vectors are used as the similarity score (cosine similarity
    under TfidfVectorizer's default L2 normalisation). A document is dropped
    when its similarity to any document that appears *before* it in ``docs``
    is strictly greater than ``threshold``; the first member of each group
    of similar documents is therefore the one that is kept.

    Args:
        docs: list of text documents.
        threshold: similarity above which two documents count as duplicates.

    Returns:
        ``docs`` itself when it is empty or when vectorisation raises
        ``ValueError`` (e.g. empty vocabulary); otherwise a new list holding
        the retained documents, unchanged and in their original order.
    """
    if not docs:
        return docs

    vect = TfidfVectorizer(min_df=1)
    try:
        tfidf = vect.fit_transform(docs)
    except ValueError:
        # avoid empty vocabulary
        return docs

    # .dot()/.toarray() work on every scipy sparse container, whereas
    # `*` and `.A` only mean "matrix product" / "to ndarray" on the legacy
    # spmatrix classes.
    similarity = tfidf.dot(tfidf.T).toarray()

    # Strict lower triangle: row i only looks at documents j < i, so a
    # document is never compared with itself or with later documents.
    matches_earlier = numpy.tril(similarity > threshold, -1)
    is_duplicate = matches_earlier.any(axis=1)

    # Filter the original objects directly; going through numpy.array(docs)
    # would coerce them to a fixed-width string dtype.
    return [doc for doc, dup in zip(docs, is_duplicate) if not dup]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment