Skip to content

Instantly share code, notes, and snippets.

@elliottcordo
elliottcordo / emr-spark-pyspark-fix.sh
Created November 24, 2014 20:30
emr spark pyspark fix
# Purpose: repack the Spark 1.1.0 assembly jar on an EMR node so PySpark runs
# (workaround for a packaging problem in the EMR-installed Spark build).
# NOTE(review): assumes the working directory contains ./spark/lib — run from
# the parent of the Spark install; confirm before executing.
unzip -d tmp1 spark/lib/spark-assembly-1.1.0-hadoop2.4.0.jar
cd tmp1
#run the line below assuming openjdk is not installed on your EMR cluster (it's probably not)
sudo yum install -y java-1.6.0-openjdk-devel.x86_64
# Rebuild the assembly jar in place from the extracted contents, reusing the
# original manifest (jar flags: c=create, v=verbose, m=use given manifest, f=output file).
# NOTE(review): the JDK path hard-codes version 1.6.0.33 — verify it matches
# what yum actually installed on this AMI.
/usr/lib/jvm/java-1.6.0-openjdk-1.6.0.33.x86_64/bin/jar cvmf META-INF/MANIFEST.MF ../spark/lib/spark-assembly-1.1.0-hadoop2.4.0.jar .
@elliottcordo
elliottcordo / emr-spark.sh
Created November 17, 2014 19:15
emr spark cluster
# Launch a 3-node (m3.xlarge) EMR cluster on AMI 3.2 with Hive, installing
# Spark via the support.elasticmapreduce bootstrap action.
# NOTE(review): KeyName=caserta-1 is an account-specific EC2 key pair —
# replace with your own before running.
aws emr create-cluster --name SparkCluster --ami-version 3.2 --instance-type m3.xlarge --instance-count 3 --ec2-attributes KeyName=caserta-1 --applications Name=Hive --bootstrap-actions Path=s3://support.elasticmapreduce/spark/install-spark
@elliottcordo
elliottcordo / redshift_ntile.sql
Last active August 29, 2015 14:09
redshift ntile query
-- Redshift ntile-style bucketing: build temp table zzt driven by a bucket
-- count derived from the source table's row count.
-- NOTE(review): this excerpt is truncated — the n_tile CTE is opened but the
-- closing paren and the main SELECT are not visible here.
drop table zzt;
create temporary table zzt as
-- cnt = number of buckets: rows/50 (integer division), capped at 5.
with n_tile as
(
select case when cnt>5 then 5 else cnt end as cnt
from
-- count(1)/50 is integer division in Redshift, so cnt is already a whole
-- number of ~50-row buckets (0 when the table has fewer than 50 rows).
( select count(1)/50 as cnt
from temp.godaddy_viewing_summary_daily_visit) a
@elliottcordo
elliottcordo / yelp_pig_join.pig
Created October 28, 2014 04:00
yelp_pig_join
-- Register elephant-bird (Twitter's JSON loader for Pig) and its dependencies
-- so nested Yelp JSON can be loaded as Pig maps.
REGISTER 's3://caserta-bucket1/libs/elephant-bird-pig.jar'
REGISTER 's3://caserta-bucket1/libs/elephant-bird-core.jar'
REGISTER 's3://caserta-bucket1/libs/elephant-bird-hadoop-compat.jar'
REGISTER 's3://caserta-bucket1/libs/json-simple.jar'
-- Load the Yelp academic dataset business file; '-nestedLoad' tells
-- JsonLoader to materialize nested JSON structures rather than flattening.
business = LOAD 's3://caserta-bucket1/yelp-academic-dataset/yelp_academic_dataset_business.json'
USING com.twitter.elephantbird.pig.load.JsonLoader('-nestedLoad');
-- NOTE(review): excerpt truncated — the FOREACH generate clause is cut off here.
business_cleaned = FOREACH business
@elliottcordo
elliottcordo / yelp_pyspark_example.py
Last active August 29, 2015 14:08
yelp pyspark example
# PySpark example: load Yelp user records from S3 into an RDD of fields.
# Launch command for the interactive shell (which provides `sc`, the SparkContext):
#MASTER=yarn-client /home/hadoop/spark/bin/pyspark
from pyspark.sql import SQLContext, Row
# Wrap the shell-provided SparkContext with a SQLContext for Spark SQL use.
sqlContext = SQLContext(sc)
#------------------------------------------------
#load some users
lines=sc.textFile("s3://caserta-bucket1/yelp/in/users/users.txt")
# Naive CSV split — NOTE(review): breaks on quoted fields containing commas;
# acceptable only if the source file is guaranteed comma-safe.
parts = lines.map(lambda l: l.split(","))
# download MovieLens data
wget --output-document=data/ml-100k.zip http://www.grouplens.org/system/files/ml-100k.zip
wget --output-document=data/ml-1m.zip http://www.grouplens.org/system/files/ml-1m.zip
# NOTE(review): the ml-10m URL uses a different host/path (files.grouplens.org
# /papers/) than the other two — confirm it still resolves.
wget --output-document=data/ml-10m.zip http://files.grouplens.org/papers/ml-10m.zip
cd data
# unzip data
unzip ml-100k.zip
# NOTE(review): ml-10m.zip is downloaded but never unzipped in the visible
# lines — excerpt appears truncated here.
unzip ml-1m.zip
@elliottcordo
elliottcordo / sql_alchemy_schema_migration.py
Last active August 29, 2015 14:07
sql alchemy schema migration
# Schema migration via SQLAlchemy automap: reflect tables from a source
# database and recreate/copy them into a target database.
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import Session, sessionmaker
from sqlalchemy import create_engine, MetaData, Table, schema, Text, Index, select, func
# Target and source connection strings.
# NOTE(review): credentials are hard-coded (root@localhost) — fine for a local
# demo, but move to environment/config before any real use.
to_db="mysql+pymysql://root@localhost/test"
#to_db="postgres://admin:your_redshift_cluster/pwd"
from_db="mysql+pymysql://root@localhost/test"
def make_session(connection_string):
@elliottcordo
elliottcordo / hive_update_strategy.hql
Created September 17, 2014 14:00
basic hive update strategy with dynamic partitioning
-- Basic Hive update strategy with dynamic partitioning: sample input data
-- followed by the first steps of a type-2 dimension load.
/* sales.csv
pizza,10.50,1,20140901
golf balls,4.44,1,20140901
hair gel,5,1,20140902
cream puffs,1.24,1,20140908
*/
/* sales2.csv
apples,4,1,20140908
frogs,3,1,20140908
-- NOTE(review): the sales2.csv comment block above is never closed in this
-- excerpt (missing "*/") — likely lost in extraction; everything below may
-- render inside the comment until that is restored.
-- NOTE(review): the bare numbered step labels ("1. ...", "2. ...") are not
-- valid HiveQL as-is; they were presumably comments in the original file.
1. max surrogate key for dim
--drop table tmp_max_key_d_type2
-- Seed the surrogate key at 1 when the dimension is empty (max() is NULL),
-- otherwise continue from the current max.
-- NOTE(review): getdate()/trunc() are not standard Hive built-ins — confirm
-- the target engine (this looks Redshift-flavored despite the .hql name).
create temporary table tmp_max_key_d_type2 as
select case when max(type2_key)is null then 1 else max(type2_key) end as max_key,
trunc(getdate())as created_date
from d_type2
2. last record in dim for selected natural key
-- NOTE(review): excerpt truncated — the SELECT for this temp table is cut off.
create temp table temp_d_type2_old
as
@elliottcordo
elliottcordo / simple_redis_zinterstore
Created August 26, 2014 22:22
another simple redis example
zadd lb-102-dist 546.2 eswar 400 elliott 311.2 marie 555 neel
zadd lb-103-dist 511.333 eswar 200 elliott 132 sue 888.4 jill
zadd demo-women 1 jill 1 sue 1 marie
zadd demo-men 1 elliott 1 eswar 1 neel
zunionstore tmp-1 2 lb-103-dist lb-102-dist
zinterstore tmp-1 2 demo-women tmp-1 WEIGHTS 0 1
zrevrange tmp-1 0 2