GitHub gists by Dat Tran (datitran)
datitran / sklearn-pyspark.py (created November 15, 2015, forked from 0asa/sklearn-pyspark.py)
Run a Scikit-Learn algorithm on top of Spark with PySpark.
from pyspark import SparkConf, SparkContext
from sklearn.datasets import make_classification
from sklearn.ensemble import ExtraTreesClassifier
import pandas as pd
import numpy as np

# Local Spark configuration: use all available cores and 1 GB of executor memory.
conf = (SparkConf()
        .setMaster("local[*]")
        .setAppName("My app")
        .set("spark.executor.memory", "1g"))
datitran / moshInstall2AmazonAMI.sh (created February 22, 2016, forked from pesblog/moshInstall2AmazonAMI.sh)
Install mosh on an AWS EC2 instance (Amazon Linux AMI).
#!/bin/sh
# Install build dependencies, then build mosh 1.2.4 from source.
sudo yum -y install autoconf automake gcc gcc-c++ make boost-devel zlib-devel ncurses-devel protobuf-devel openssl-devel
cd /usr/local/src
sudo wget http://mosh.mit.edu/mosh-1.2.4.tar.gz
sudo tar xvf mosh-1.2.4.tar.gz
cd mosh-1.2.4
sudo ./autogen.sh
sudo ./configure
sudo make
# The preview ends at "sudo make"; installing the built binaries additionally requires:
sudo make install
import boto3


class EMRLoader(object):
    def __init__(self, aws_access_key, aws_secret_access_key, region_name, cluster_name, instance_count, key_name,
                 log_uri, software_version, script_bucket_name):
        self.aws_access_key = aws_access_key
        self.aws_secret_access_key = aws_secret_access_key
        self.region_name = region_name
        self.cluster_name = cluster_name
        self.instance_count = instance_count
        self.key_name = key_name
        self.log_uri = log_uri
        self.software_version = software_version
        self.script_bucket_name = script_bucket_name

    def emr_client(self):
        # Build a boto3 EMR client from the stored credentials and region.
        client = boto3.client("emr",
                              aws_access_key_id=self.aws_access_key,
                              aws_secret_access_key=self.aws_secret_access_key,
                              region_name=self.region_name)
        return client

    def load_cluster(self):
        # Launch the EMR cluster. The shown call is cut off after the Instances block;
        # it is closed here with the stored key pair and the standard default EMR roles (assumed).
        response = self.emr_client().run_job_flow(
            Name=self.cluster_name,
            LogUri=self.log_uri,
            ReleaseLabel=self.software_version,
            Instances={
                'MasterInstanceType': 'm3.xlarge',
                'SlaveInstanceType': 'm3.xlarge',
                'InstanceCount': self.instance_count,
                'KeepJobFlowAliveWhenNoSteps': True,
                'Ec2KeyName': self.key_name,  # assumed placement for the stored key pair name
            },
            JobFlowRole='EMR_EC2_DefaultRole',  # assumed default roles, not shown in the preview
            ServiceRole='EMR_DefaultRole')
        return response

    def add_step(self, job_flow_id, master_dns):
        # Add a setup step that copies bootstrap files onto the cluster;
        # the aws s3 cp source and destination are cut off in the preview.
        response = self.emr_client().add_job_flow_steps(
            JobFlowId=job_flow_id,
            Steps=[
                {
                    'Name': 'setup - copy files',
                    'ActionOnFailure': 'CANCEL_AND_WAIT',
                    'HadoopJarStep': {
                        'Jar': 'command-runner.jar',
                        'Args': ['aws', 's3', 'cp',
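The step arguments above are cut off. For orientation, here is a hedged sketch of how the class might be driven end to end; the config values, the waiter, and the variable names are illustrative and are not taken from the truncated gist.

# Illustrative driver code (assumed, not part of the gist preview).
loader = EMRLoader(aws_access_key="...", aws_secret_access_key="...",
                   region_name="eu-central-1", cluster_name="my-cluster",
                   instance_count=3, key_name="my-keypair",
                   log_uri="s3://my-bucket/logs", software_version="emr-5.4.0",
                   script_bucket_name="my-script-bucket")

response = loader.load_cluster()
job_flow_id = response["JobFlowId"]

# Block until the cluster is up, then fetch the master's public DNS name.
client = loader.emr_client()
client.get_waiter("cluster_running").wait(ClusterId=job_flow_id)
master_dns = client.describe_cluster(ClusterId=job_flow_id)["Cluster"]["MasterPublicDnsName"]

loader.add_step(job_flow_id, master_dns)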
#!/usr/bin/env bash
# install conda
wget --quiet https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O ~/miniconda.sh \
&& /bin/bash ~/miniconda.sh -b -p $HOME/conda
echo -e '\nexport PATH=$HOME/conda/bin:$PATH' >> $HOME/.bashrc && source $HOME/.bashrc
# install packages
conda install -y ipython jupyter
#!/usr/bin/env bash
# Bind the conda Python to Spark and make pyspark start a Jupyter notebook server.
# $1: IP address the notebook server should listen on (passed in by the caller).
echo -e "\nexport PYSPARK_PYTHON=/home/hadoop/conda/bin/python" >> /etc/spark/conf/spark-env.sh
echo "export PYSPARK_DRIVER_PYTHON=/home/hadoop/conda/bin/jupyter" >> /etc/spark/conf/spark-env.sh
echo "export PYSPARK_DRIVER_PYTHON_OPTS='notebook --no-browser --ip=$1'" >> /etc/spark/conf/spark-env.sh
emr:
  aws_access_key: <AWS Access Key>
  aws_secret_access_key: <AWS Secret Access Key>
  region_name: <Region Name>
  cluster_name: <Cluster Name>
  instance_count: <Number of Instances>
  key_name: <EC2 Key Pair Name>
  log_uri: s3://<Your S3 Location for the Logs>
  software_version: <emr-5.x.x>
  script_bucket_name: <Name of Bucket to be Created on S3>
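The config keys map one-to-one onto the EMRLoader constructor arguments. Below is a minimal sketch of wiring the two together, assuming the file is named config.yml and that PyYAML is available; both the file name and the YAML library are assumptions, not stated in the gists.

# Illustrative glue code; the file name and the use of PyYAML are assumptions.
import yaml

with open("config.yml") as f:
    config = yaml.safe_load(f)["emr"]

# The YAML keys match the constructor argument names, so the dict can be unpacked directly.
loader = EMRLoader(**config)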