Skip to content

Instantly share code, notes, and snippets.

@elliottcordo
elliottcordo / emr-spark-pyspark-fix.sh
Created November 24, 2014 20:30
emr spark pyspark fix
# Purpose: repack the Spark 1.1.0 assembly jar on an EMR node so PySpark runs
# (workaround for a packaging problem in the EMR-installed Spark build).
# NOTE(review): assumes the working directory contains ./spark/lib — run from
# the parent of the Spark install; confirm before executing.
unzip -d tmp1 spark/lib/spark-assembly-1.1.0-hadoop2.4.0.jar
cd tmp1
#run the line below assuming openjdk is not installed on your EMR cluster (it's probably not)
sudo yum install -y java-1.6.0-openjdk-devel.x86_64
# Rebuild the assembly jar in place from the extracted contents, reusing the
# original manifest (jar flags: c=create, v=verbose, m=use given manifest, f=output file).
# NOTE(review): the JDK path hard-codes version 1.6.0.33 — verify it matches
# what yum actually installed on this AMI.
/usr/lib/jvm/java-1.6.0-openjdk-1.6.0.33.x86_64/bin/jar cvmf META-INF/MANIFEST.MF ../spark/lib/spark-assembly-1.1.0-hadoop2.4.0.jar .
@elliottcordo
elliottcordo / emr-spark.sh
Created November 17, 2014 19:15
emr spark cluster
# Launch a 3-node (m3.xlarge) EMR cluster on AMI 3.2 with Hive, installing
# Spark via the support.elasticmapreduce bootstrap action.
# NOTE(review): KeyName=caserta-1 is an account-specific EC2 key pair —
# replace with your own before running.
aws emr create-cluster --name SparkCluster --ami-version 3.2 --instance-type m3.xlarge --instance-count 3 --ec2-attributes KeyName=caserta-1 --applications Name=Hive --bootstrap-actions Path=s3://support.elasticmapreduce/spark/install-spark
@elliottcordo
elliottcordo / redshift_ntile.sql
Last active August 29, 2015 14:09
redshift ntile query
-- Redshift ntile-style bucketing: build temp table zzt driven by a bucket
-- count derived from the source table's row count.
-- NOTE(review): this excerpt is truncated — the n_tile CTE is opened but the
-- closing paren and the main SELECT are not visible here.
drop table zzt;
create temporary table zzt as
-- cnt = number of buckets: rows/50 (integer division), capped at 5.
with n_tile as
(
select case when cnt>5 then 5 else cnt end as cnt
from
-- count(1)/50 is integer division in Redshift, so cnt is already a whole
-- number of ~50-row buckets (0 when the table has fewer than 50 rows).
( select count(1)/50 as cnt
from temp.godaddy_viewing_summary_daily_visit) a
@elliottcordo
elliottcordo / yelp_pig_join.pig
Created October 28, 2014 04:00
yelp_pig_join
-- Register elephant-bird (Twitter's JSON loader for Pig) and its dependencies
-- so nested Yelp JSON can be loaded as Pig maps.
REGISTER 's3://caserta-bucket1/libs/elephant-bird-pig.jar'
REGISTER 's3://caserta-bucket1/libs/elephant-bird-core.jar'
REGISTER 's3://caserta-bucket1/libs/elephant-bird-hadoop-compat.jar'
REGISTER 's3://caserta-bucket1/libs/json-simple.jar'
-- Load the Yelp academic dataset business file; '-nestedLoad' tells
-- JsonLoader to materialize nested JSON structures rather than flattening.
business = LOAD 's3://caserta-bucket1/yelp-academic-dataset/yelp_academic_dataset_business.json'
USING com.twitter.elephantbird.pig.load.JsonLoader('-nestedLoad');
-- NOTE(review): excerpt truncated — the FOREACH generate clause is cut off here.
business_cleaned = FOREACH business
@elliottcordo
elliottcordo / yelp_pyspark_example.py
Last active August 29, 2015 14:08
yelp pyspark example
# PySpark example: load Yelp user records from S3 into an RDD of fields.
# Launch command for the interactive shell (which provides `sc`, the SparkContext):
#MASTER=yarn-client /home/hadoop/spark/bin/pyspark
from pyspark.sql import SQLContext, Row
# Wrap the shell-provided SparkContext with a SQLContext for Spark SQL use.
sqlContext = SQLContext(sc)
#------------------------------------------------
#load some users
lines=sc.textFile("s3://caserta-bucket1/yelp/in/users/users.txt")
# Naive CSV split — NOTE(review): breaks on quoted fields containing commas;
# acceptable only if the source file is guaranteed comma-safe.
parts = lines.map(lambda l: l.split(","))
# download MovieLens data
wget --output-document=data/ml-100k.zip http://www.grouplens.org/system/files/ml-100k.zip
wget --output-document=data/ml-1m.zip http://www.grouplens.org/system/files/ml-1m.zip
# NOTE(review): the ml-10m URL uses a different host/path (files.grouplens.org
# /papers/) than the other two — confirm it still resolves.
wget --output-document=data/ml-10m.zip http://files.grouplens.org/papers/ml-10m.zip
cd data
# unzip data
unzip ml-100k.zip
# NOTE(review): ml-10m.zip is downloaded but never unzipped in the visible
# lines — excerpt appears truncated here.
unzip ml-1m.zip
@elliottcordo
elliottcordo / sql_alchemy_schema_migration.py
Last active August 29, 2015 14:07
sql alchemy schema migration
# Schema migration via SQLAlchemy automap: reflect tables from a source
# database and recreate/copy them into a target database.
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import Session, sessionmaker
from sqlalchemy import create_engine, MetaData, Table, schema, Text, Index, select, func
# Target and source connection strings.
# NOTE(review): credentials are hard-coded (root@localhost) — fine for a local
# demo, but move to environment/config before any real use.
to_db="mysql+pymysql://root@localhost/test"
#to_db="postgres://admin:your_redshift_cluster/pwd"
from_db="mysql+pymysql://root@localhost/test"
def make_session(connection_string):
@elliottcordo
elliottcordo / hive_update_strategy.hql
Created September 17, 2014 14:00
basic hive update strategy with dynamic partitioning
-- Basic Hive update strategy with dynamic partitioning: sample input data
-- followed by the first steps of a type-2 dimension load.
/* sales.csv
pizza,10.50,1,20140901
golf balls,4.44,1,20140901
hair gel,5,1,20140902
cream puffs,1.24,1,20140908
*/
/* sales2.csv
apples,4,1,20140908
frogs,3,1,20140908
-- NOTE(review): the sales2.csv comment block above is never closed in this
-- excerpt (missing "*/") — likely lost in extraction; everything below may
-- render inside the comment until that is restored.
-- NOTE(review): the bare numbered step labels ("1. ...", "2. ...") are not
-- valid HiveQL as-is; they were presumably comments in the original file.
1. max surrogate key for dim
--drop table tmp_max_key_d_type2
-- Seed the surrogate key at 1 when the dimension is empty (max() is NULL),
-- otherwise continue from the current max.
-- NOTE(review): getdate()/trunc() are not standard Hive built-ins — confirm
-- the target engine (this looks Redshift-flavored despite the .hql name).
create temporary table tmp_max_key_d_type2 as
select case when max(type2_key)is null then 1 else max(type2_key) end as max_key,
trunc(getdate())as created_date
from d_type2
2. last record in dim for selected natural key
-- NOTE(review): excerpt truncated — the SELECT for this temp table is cut off.
create temp table temp_d_type2_old
as
@elliottcordo
elliottcordo / simple_redis_zinterstore
Created August 26, 2014 22:22
another simple redis example
zadd lb-102-dist 546.2 eswar 400 elliott 311.2 marie 555 neel
zadd lb-103-dist 511.333 eswar 200 elliott 132 sue 888.4 jill
zadd demo-women 1 jill 1 sue 1 marie
zadd demo-men 1 elliott 1 eswar 1 neel
zunionstore tmp-1 2 lb-103-dist lb-102-dist
zinterstore tmp-1 2 demo-women tmp-1 WEIGHTS 0 1
zrevrange tmp-1 0 2