MLWhiz · July 3, 2019 16:17
diff --git a/most_rated.py b/most_rated.py
 # Most-rated movies: count how many ratings each movie title received and
 # print the 25 titles with the highest counts. The print(...) calls before
 # the final answer are diagnostics showing a small sample of each stage.

 # Project each tab-separated rating record down to (movie_id, rating).
 # Every line is split once and then indexed, rather than re-split per column.
 RDD_movid_rating = ratingRDD.map(lambda x: x.split("\t")).map(lambda f: (f[1], f[2]))
 print("RDD_movid_rating:", RDD_movid_rating.take(4))

 # Project each pipe-separated movie record down to (movie_id, title).
 RDD_movid_title = movieRDD.map(lambda x: x.split("|")).map(lambda f: (f[0], f[1]))
 print("RDD_movid_title:", RDD_movid_title.take(2))

 # Join ratings to titles on movie_id. leftOuterJoin keeps every rating and
 # yields (movie_id, (rating, title)); title is None when no movie matches.
 rdd_movid_title_rating = RDD_movid_rating.leftOuterJoin(RDD_movid_title)
 print("rdd_movid_title_rating:", rdd_movid_title_rating.take(1))

 # Emit one (title, 1) pair per rating so the pairs can be summed per title.
 rdd_title_rating = rdd_movid_title_rating.map(lambda x: (x[1][1], 1))
 print("rdd_title_rating:", rdd_title_rating.take(2))

 # Sum the 1s per title to get (title, number_of_ratings).
 rdd_title_ratingcnt = rdd_title_rating.reduceByKey(lambda x, y: x + y)
 print("rdd_title_ratingcnt:", rdd_title_ratingcnt.take(2))

 # takeOrdered is an action; ordering by the negated count returns the 25
 # largest counts first. print() calls are used instead of print statements
 # so the script also parses on Python 3, like the diagnostics above.
 print("#####################################")
 print("25 most rated movies:", rdd_title_ratingcnt.takeOrdered(25, lambda x: -x[1]))
 print("#####################################")
	# Most-rated movies: count how many ratings each movie title received and
	# print the 25 titles with the highest counts. The print(...) calls before
	# the final answer are diagnostics showing a small sample of each stage.

	# Project each tab-separated rating record down to (movie_id, rating).
	# Every line is split once and then indexed, rather than re-split per column.
	RDD_movid_rating = ratingRDD.map(lambda x: x.split("\t")).map(lambda f: (f[1], f[2]))
	print("RDD_movid_rating:", RDD_movid_rating.take(4))

	# Project each pipe-separated movie record down to (movie_id, title).
	# str.split takes a literal separator, not a regex, so the pipe must not be
	# backslash-escaped: "\|" would look for a backslash followed by a pipe,
	# never match, and make the [1] lookup fail.
	RDD_movid_title = movieRDD.map(lambda x: x.split("|")).map(lambda f: (f[0], f[1]))
	print("RDD_movid_title:", RDD_movid_title.take(2))

	# Join ratings to titles on movie_id. leftOuterJoin keeps every rating and
	# yields (movie_id, (rating, title)); title is None when no movie matches.
	rdd_movid_title_rating = RDD_movid_rating.leftOuterJoin(RDD_movid_title)
	print("rdd_movid_title_rating:", rdd_movid_title_rating.take(1))

	# Emit one (title, 1) pair per rating so the pairs can be summed per title.
	rdd_title_rating = rdd_movid_title_rating.map(lambda x: (x[1][1], 1))
	print("rdd_title_rating:", rdd_title_rating.take(2))

	# Sum the 1s per title to get (title, number_of_ratings).
	rdd_title_ratingcnt = rdd_title_rating.reduceByKey(lambda x, y: x + y)
	print("rdd_title_ratingcnt:", rdd_title_ratingcnt.take(2))

	# takeOrdered is an action; ordering by the negated count returns the 25
	# largest counts first. print() calls are used instead of print statements
	# so the script also parses on Python 3, like the diagnostics above.
	print("#####################################")
	print("25 most rated movies:", rdd_title_ratingcnt.takeOrdered(25, lambda x: -x[1]))
	print("#####################################")