Skip to content

Instantly share code, notes, and snippets.

@AcidLeroy
Last active November 15, 2015 03:48
Show Gist options
  • Save AcidLeroy/ccd1a7954a97529ed32b to your computer and use it in GitHub Desktop.
Finding Max of RDD Triplet for Each User
from pyspark import SparkContext, SparkConf
def reduce_by_max(rdd):
    """Return the triplet with the largest occurrence count.

    Parameters
    ----------
    rdd : list of tuple
        Triplets of the form ('user', 'item', occurrences) for a single
        user, where the third element is a float count.

    Returns
    -------
    tuple
        The triplet whose third element is maximal.  On ties the first
        such triplet wins, matching the original strict-``>`` loop.

    Raises
    ------
    ValueError
        If *rdd* is empty (the original raised IndexError instead; an
        empty group never occurs after ``groupBy``, so no caller sees
        this path).
    """
    # The builtin max with a key function replaces the manual
    # index-tracking loop; it also returns the first maximum on ties.
    return max(rdd, key=lambda triplet: triplet[2])
# Configure the Spark driver application.
conf = SparkConf() \
    .setAppName("Collaborative Filter") \
    .set("spark.executor.memory", "5g")
sc = SparkContext(conf=conf)

# The input file contains whitespace-separated ('user', 'item', 'occurrences')
# rows.  BUG FIX: SparkContext has no textData method; the correct API for
# reading a text file into an RDD of lines is textFile.
data_file = sc.textFile('file:///some_file.txt')

# Parse each line into a ('user', 'item', float(occurrences)) triplet so the
# fields can be indexed positionally downstream.
data_file = data_file.map(lambda l: l.split()).map(lambda l: (l[0], l[1], float(l[2])))

# Group the triplets by user, i.e. element 0 of each triplet.
grouped = data_file.groupBy(lambda r: r[0])

# Drop the grouping key, keeping only the list of triplets for each user.
group_list = grouped.map(lambda x: list(x[1]))

# Pick the highest-occurrence triplet per user and collect to the driver.
max_list = group_list.map(reduce_by_max).collect()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment