fran0x · September 26, 2017 12:49 · farzad7186 · May 18, 2017
diff --git a/yelp_pyspark_df.py b/yelp_pyspark_df.py
 # Exploratory PySpark queries over the Yelp demo CSV, meant to be typed into
 # (or pasted into) a pyspark shell. `sqlCtx` is not defined in this script; it
 # is expected to come from the shell session launched as described below.

 # copy the Hive configuration file hive-site.xml to the spark configuration folder
 # sudo cp /etc/hive/conf.dist/hive-site.xml /usr/lib/spark/conf/

 # launch pyspark with the spark-csv package (note: version 1.2.0 has some issues thus better use 1.3.0)
 # PYSPARK_DRIVER_PYTHON=ipython pyspark --packages com.databricks:spark-csv_2.10:1.3.0

 # check dataframes are working
 sqlCtx.createDataFrame([("somekey", 1)])

 # load yelp dataset (first CSV row is used as header, column types are inferred)
 yelp_df = sqlCtx.load(
     source='com.databricks.spark.csv',
     header='true',
     inferSchema='true',
     path='file:////usr/lib/hue/apps/search/examples/collections/solr_configs_yelp_demo/index_data.csv')

 # 1) calculate mean of the "cool" column across all of the dataset
 yelp_df.select('cool').agg({'cool': 'mean'}).collect()

 # 2) calculate average of the "cool" column for venues with 4 "stars"
 #    for records with a "review count" of 10 or more
 #    (the average is shown for every star rating; read the row where stars == 4)
 yelp_df.filter(yelp_df.review_count >= 10).groupBy(yelp_df.stars).avg('cool').show()

 # 3) calculate average of the "cool" column for venues with 5 "stars"
 #    for records with a "review count" of 10 or more and records for which the venue is still open
 #    (the average is shown for every star rating; read the row where stars == 5)
 yelp_df.filter(yelp_df.review_count >= 10).filter(yelp_df.open == 'True').groupBy(yelp_df.stars).avg('cool').show()

 # 4) calculate state with 3rd highest number of reviews for venues with 5 "stars"
 #    for records with a "review count" of 10 or more and records for which the venue is still open
 #    (states are listed by descending total; the answer is the third row)
 from pyspark.sql import functions as F
 from pyspark.sql.functions import asc, desc
 # The aggregate is given an explicit alias so the sort does not depend on the
 # auto-generated column name (which embeds a session-specific id such as "#16").
 # The stars == 5 filter implements the "venues with 5 stars" condition stated above.
 (yelp_df
     .filter(yelp_df.stars == 5)
     .filter(yelp_df.review_count >= 10)
     .filter(yelp_df.open == 'True')
     .groupBy(yelp_df.state)
     .agg(F.sum('review_count').alias('total_reviews'))
     .orderBy(desc('total_reviews'))
     .show())

 # 5) calculate maximum number of "reviews" per venue for the whole dataset
 #    (counts rows per (business_id, review_count) pair, sorted ascending by
 #    that count, so the largest count is the last element of the collected list)
 yelp_df.groupBy(yelp_df.business_id, yelp_df.review_count).count().orderBy('count').collect()
	# Exploratory PySpark queries over the Yelp demo CSV, meant to be typed into
	# (or pasted into) a pyspark shell. `sqlCtx` is not defined in this script; it
	# is expected to come from the shell session launched as described below.

	# copy the Hive configuration file hive-site.xml to the spark configuration folder
	# sudo cp /etc/hive/conf.dist/hive-site.xml /usr/lib/spark/conf/

	# launch pyspark with the spark-csv package (note: version 1.2.0 has some issues thus better use 1.3.0)
	# PYSPARK_DRIVER_PYTHON=ipython pyspark --packages com.databricks:spark-csv_2.10:1.3.0

	# check dataframes are working
	sqlCtx.createDataFrame([("somekey", 1)])

	# load yelp dataset (first CSV row is used as header, column types are inferred)
	yelp_df = sqlCtx.load(
		source='com.databricks.spark.csv',
		header='true',
		inferSchema='true',
		path='file:////usr/lib/hue/apps/search/examples/collections/solr_configs_yelp_demo/index_data.csv')

	# 1) calculate mean of the "cool" column across all of the dataset
	yelp_df.select('cool').agg({'cool': 'mean'}).collect()

	# 2) calculate average of the "cool" column for venues with 4 "stars"
	# for records with a "review count" of 10 or more
	# (the average is shown for every star rating; read the row where stars == 4)
	yelp_df.filter(yelp_df.review_count >= 10).groupBy(yelp_df.stars).avg('cool').show()

	# 3) calculate average of the "cool" column for venues with 5 "stars"
	# for records with a "review count" of 10 or more and records for which the venue is still open
	# (the average is shown for every star rating; read the row where stars == 5)
	yelp_df.filter(yelp_df.review_count >= 10).filter(yelp_df.open == 'True').groupBy(yelp_df.stars).avg('cool').show()

	# 4) calculate state with 3rd highest number of reviews for venues with 5 "stars"
	# for records with a "review count" of 10 or more and records for which the venue is still open
	# (states are listed by descending total; the answer is the third row)
	from pyspark.sql import functions as F
	from pyspark.sql.functions import asc, desc
	# The aggregate is given an explicit alias so the sort does not depend on the
	# auto-generated column name (which embeds a session-specific id such as "#16").
	# The stars == 5 filter implements the "venues with 5 stars" condition stated above.
	(yelp_df
		.filter(yelp_df.stars == 5)
		.filter(yelp_df.review_count >= 10)
		.filter(yelp_df.open == 'True')
		.groupBy(yelp_df.state)
		.agg(F.sum('review_count').alias('total_reviews'))
		.orderBy(desc('total_reviews'))
		.show())

	# 5) calculate maximum number of "reviews" per venue for the whole dataset
	# (counts rows per (business_id, review_count) pair, sorted ascending by
	# that count, so the largest count is the last element of the collected list)
	yelp_df.groupBy(yelp_df.business_id, yelp_df.review_count).count().orderBy('count').collect()