
@elliottcordo
elliottcordo / fuzzy_customer.sql
Created August 4, 2015 11:25
simple non-fuzzy customer matching in postgres
create index ix_customer_first_last on mstr.d_customer(first_name, last_name);
create index ix_email on mstr.d_customer(email);
drop table if exists cust_matches;
--1. first name matches
create temporary table cust_matches as
select c.customer_key as customer_key_1, c.first_name as first_name_1,c.last_name as last_name_1, c.email as email_1,
c1.customer_key as customer_key_2, c1.first_name as first_name_2, c1.last_name as last_name_2, c1.email as email_2,
elliottcordo / close_channel.md
Last active August 29, 2015 14:21
close channel

The close channel is derived in the following subquery:

left outer join 
    ( select distinct new_opportunityid
      from stg.v_contractactions ca 
        join stg.v_tbltransmaster tm on ca.vcmembertransnumber=tm.vctransactionnumber
          and ca.source_system_id=tm.source_system_id
        join stg.esp_new_contractactions eca on ca.icontractid=eca.new_name
        join stg.bs_tbl_facility b on b.code=eca.new_vchomefacilityid
           and b.companyid=ca.company_id
import beatbox
import csv
svc = beatbox.PythonClient()
svc.login('[email protected]', '*********pHCt4OfKqxtGlnLXM25TotCRv')
records = svc.query("SELECT Id, FirstName, LastName FROM Contact")
with open('contacts.csv', 'w') as outfile:
    fp = csv.DictWriter(outfile, records[0].keys())
    fp.writeheader()
    fp.writerows(records)
--create some non-existent relationships
MATCH (m:CONTACT {name: "leslie"}), (k:CONTACT {name: "greg"}) MERGE (k)-[r:KNOWS]-(m) ;
MATCH (m:CONTACT {name: "elliott"}), (k:CONTACT {name: "greg"}) MERGE (k)-[r:FOLLOWS]->(m) ON CREATE SET r.since = ["2014-01-01"];
--delete my connection to amex
MATCH (m:CONTACT {name: "elliott"})-[r]-(k {name: "American Express Company"}) DELETE r;
--find 2nd level connections
match (elliott {name: "elliott"})-[r*0..2]-()
return *
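The `[r*0..2]` pattern above expands paths up to two hops out from the starting node. The same traversal over a hypothetical in-memory adjacency list (contact names reused from the examples, graph data invented for illustration) can be sketched in plain Python as a bounded breadth-first search:

```python
from collections import deque

# Hypothetical contact graph echoing the Cypher examples above.
graph = {
    "elliott": {"greg"},
    "greg": {"elliott", "leslie"},
    "leslie": {"greg"},
}

def within_hops(graph, start, max_hops):
    """Return each node reachable from start in at most max_hops edges,
    mapped to its hop distance -- like (start)-[*0..max_hops]-()."""
    seen = {start: 0}
    queue = deque([start])
    while queue:
        node = queue.popleft()
        if seen[node] == max_hops:
            continue  # don't expand past the hop limit
        for neighbor in graph.get(node, ()):
            if neighbor not in seen:
                seen[neighbor] = seen[node] + 1
                queue.append(neighbor)
    return seen

print(within_hops(graph, "elliott", 2))
# {'elliott': 0, 'greg': 1, 'leslie': 2}
```

Unlike the Cypher version, this returns each node once with its shortest hop count rather than every matching path.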
elliottcordo / gist:e84aa38518f6a8f7dc61
Created February 5, 2015 02:24
google spreadsheets
import gspread
gc = gspread.login('[email protected]','xxx')
wks = gc.open("wanderlust categories").get_worksheet(0)
list_of_lists = wks.get_all_values()
for l in list_of_lists:
    print l
elliottcordo / gist:88baf8233b4165a939a4
Last active August 29, 2015 14:13
Some initial stuff from SparkSQL meetup

### Create an EMR cluster on the latest Spark version --> should be fine if you don't need Hive or Parquet

aws emr create-cluster --name SparkCluster --ami-version 3.2.1 --instance-type m3.xlarge --instance-count 3 --ec2-attributes KeyName=<your key homey> --applications Name=Hive --bootstrap-actions Path=s3://support.elasticmapreduce/spark/install-spark,Args=\["-v1.2.0.a"\] 

### OK, now you have a cluster -- do some slicing and dicing through pyspark. Client mode is fine, but be sure to start it in a screen session; you will also need to tune the parameters.

./spark/bin/pyspark --master yarn --deploy-mode client --num-executors 12 --executor-memory 2g --executor-cores 4
elliottcordo / spark_thrift_standalone
Created December 15, 2014 22:29
starting spark thrift server standalone
#spark scheduler
/home/hadoop/spark/bin/pyspark --master spark://ip-10-63-51-140.ec2.internal:7077
elliottcordo / emr-spark-hive-context-fix
Created December 15, 2014 22:23
Spark hiveContext Fix
cp /home/hadoop/hive/conf/hive-default.xml /home/hadoop/spark/conf/hive-site.xml
sed -i 's/SPARK_CLASSPATH=\"/&\/home\/hadoop\/hive\/lib\/bonecp-0.8.0.RELEASE.jar:\/home\/hadoop\/hive\/lib\/mysql-connector-java-5.1.30.jar:/' /home/hadoop/spark/conf/spark-env.sh
elliottcordo / emr_spark_thrift_on_yarn
Created December 15, 2014 22:21
EMR spark thrift server
#on cluster
/home/hadoop/spark/sbin/start-thriftserver.sh --master yarn-client
#ssh tunnel, direct 10000 to unused 8157
ssh -i ~/caserta-1.pem -N -L 8157:ec2-54-221-27-21.compute-1.amazonaws.com:10000 [email protected]
#see this for JDBC config on client http://blogs.aws.amazon.com/bigdata/post/TxT7CJ0E7CRX88/Using-Amazon-EMR-with-SQL-Workbench-and-other-BI-Tools
elliottcordo / spark_emr
Last active August 29, 2015 14:11
spark cluster emr command
aws emr create-cluster --name SparkCluster --ami-version 3.2.1 --instance-type m3.xlarge --instance-count 3 --ec2-attributes KeyName=caserta-1 --applications Name=Hive --bootstrap-actions Path=s3://support.elasticmapreduce/spark/install-spark,Args=\["-v1.1.0.d"\]