This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Repack the Spark assembly jar with the OpenJDK 1.6 `jar` tool.
# Run from the directory that contains `spark/`; the relative paths below depend on it.
# NOTE(review): presumably a workaround for an assembly jar that the cluster's Java cannot read -- confirm.
# Extract the assembly jar into ./tmp1 and work from inside the extracted tree.
unzip -d tmp1 spark/lib/spark-assembly-1.1.0-hadoop2.4.0.jar
cd tmp1
# Run the line below assuming openjdk is not installed on your EMR cluster (it's probably not).
sudo yum install -y java-1.6.0-openjdk-devel.x86_64
# Rebuild the jar over the original file from the current directory, reusing the extracted manifest.
/usr/lib/jvm/java-1.6.0-openjdk-1.6.0.33.x86_64/bin/jar cvmf META-INF/MANIFEST.MF ../spark/lib/spark-assembly-1.1.0-hadoop2.4.0.jar .
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Create a 3-instance m3.xlarge EMR cluster (AMI 3.2) named SparkCluster, with Hive as an
# application and the install-spark bootstrap action from the support.elasticmapreduce bucket.
aws emr create-cluster \
    --name SparkCluster \
    --ami-version 3.2 \
    --instance-type m3.xlarge \
    --instance-count 3 \
    --ec2-attributes KeyName=caserta-1 \
    --applications Name=Hive \
    --bootstrap-actions Path=s3://support.elasticmapreduce/spark/install-spark
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
drop table zzt; | |
create temporary table zzt as | |
with n_tile as | |
( | |
select case when cnt>5 then 5 else cnt end as cnt | |
from | |
( select count(1)/50 as cnt | |
from temp.godaddy_viewing_summary_daily_visit) a |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
REGISTER 's3://caserta-bucket1/libs/elephant-bird-pig.jar' | |
REGISTER 's3://caserta-bucket1/libs/elephant-bird-core.jar' | |
REGISTER 's3://caserta-bucket1/libs/elephant-bird-hadoop-compat.jar' | |
REGISTER 's3://caserta-bucket1/libs/json-simple.jar' | |
business = LOAD 's3://caserta-bucket1/yelp-academic-dataset/yelp_academic_dataset_business.json' | |
USING com.twitter.elephantbird.pig.load.JsonLoader('-nestedLoad'); | |
business_cleaned = FOREACH business |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#MASTER=yarn-client /home/hadoop/spark/bin/pyspark
# NOTE(review): `sc` is not defined in this snippet; presumably it is the SparkContext
# that the pyspark shell launched by the command above provides -- confirm.
from pyspark.sql import SQLContext, Row
sqlContext = SQLContext(sc)
#------------------------------------------------
# load some users
# Read the users file from S3 as an RDD of text lines.
lines = sc.textFile("s3://caserta-bucket1/yelp/in/users/users.txt")
# Split each line on commas into a list of field strings.
parts = lines.map(lambda l: l.split(","))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# download MovieLens data
# NOTE(review): these download URLs may have moved since this was written -- confirm they still resolve.
# wget cannot create the output file when the target directory is missing, so make sure it exists.
mkdir -p data
wget --output-document=data/ml-100k.zip http://www.grouplens.org/system/files/ml-100k.zip
wget --output-document=data/ml-1m.zip http://www.grouplens.org/system/files/ml-1m.zip
wget --output-document=data/ml-10m.zip http://files.grouplens.org/papers/ml-10m.zip
cd data
# unzip data
unzip ml-100k.zip
unzip ml-1m.zip
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from sqlalchemy.ext.automap import automap_base | |
from sqlalchemy.orm import Session, sessionmaker | |
from sqlalchemy import create_engine, MetaData, Table, schema, Text, Index, select, func | |
to_db="mysql+pymysql://root@localhost/test" | |
#to_db="postgres://admin:your_redshift_cluster/pwd" | |
from_db="mysql+pymysql://root@localhost/test" | |
def make_session(connection_string): |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* sales.csv | |
pizza,10.50,1,20140901 | |
golf balls,4.44,1,20140901 | |
hair gel,5,1,20140902 | |
cream puffs,1.24,1,20140908 | |
*/ | |
/* sales2.csv | |
apples,4,1,20140908 | |
frogs,3,1,20140908 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
1. max surrogate key for dim | |
--drop table tmp_max_key_d_type2 | |
create temporary table tmp_max_key_d_type2 as | |
select case when max(type2_key)is null then 1 else max(type2_key) end as max_key, | |
trunc(getdate())as created_date | |
from d_type2 | |
2. last record in dim for selected natural key | |
create temp table temp_d_type2_old | |
as |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
zadd lb-102-dist 546.2 eswar 400 elliott 311.2 marie 555 neel
zadd lb-103-dist 511.333 eswar 200 elliott 132 sue 888.4 jill
zadd demo-women 1 jill 1 sue 1 marie
zadd demo-men 1 elliott 1 eswar 1 neel
zunionstore tmp-1 2 lb-103-dist lb-102-dist
zinterstore tmp-1 2 demo-women tmp-1 WEIGHTS 0 1
zrevrange tmp-1 0 2