dgadiraju · March 14, 2017 05:56
diff --git a/pyspark-topnpricesproducts.py b/pyspark-topnpricesproducts.py
 def getTopDenseN(rec, topN):
     """Return the records priced at one of the topN highest distinct prices.

     Simulates SQL dense-rank semantics: every record whose price ties with
     one of the topN distinct prices is kept, so more than topN records may
     come back.

     rec  -- iterable of comma-delimited strings; field index 4 must parse
             as a float (the price).
     topN -- number of distinct prices to keep; passed to itertools.islice
             as the stop value.

     Returns a generator over the matching records, highest price first;
     records with equal prices keep their input order.
     """
     import itertools
     # Parse each price exactly once and materialise the input up front:
     # the records are needed both to rank the prices and to build the
     # result, and a one-shot iterator would be exhausted after one pass.
     pricedRecords = [(float(line.split(",")[4]), line) for line in rec]
     # Distinct prices in descending order, truncated to the best topN.
     distinctPricesDesc = sorted(set(price for price, _ in pricedRecords), reverse=True)
     # A set gives constant-time membership checks in the filter below.
     topNPrices = set(itertools.islice(distinctPricesDesc, 0, topN))
     # list.sort() is stable, so records sharing a price keep their input order.
     pricedRecords.sort(key=lambda pair: pair[0], reverse=True)
     topNPricedProducts = [line for price, line in pricedRecords if price in topNPrices]
     return (y for y in topNPricedProducts)

 # Load the raw product lines and drop those whose price field (index 4) is empty.
 products = sc.textFile("/public/retail_db/products")
 productsFiltered = products.filter(lambda line: line.split(",")[4] != "")

 # Key each line by the integer in field 1 (presumably the category id --
 # confirm against the data set), group by that key, and keep the
 # dense-ranked top 5 prices of every group.
 topPricedProducts = (
     productsFiltered
     .map(lambda line: (int(line.split(",")[1]), line))
     .groupByKey()
     .flatMap(lambda keyed: getTopDenseN(keyed[1], 5))
 )
 # collect() brings the result to the driver so it can be printed locally.
 for product in topPricedProducts.collect():
     print(product)
	def getTopDenseN(rec, topN):
	    """Return the records priced at one of the topN highest distinct prices.

	    Simulates SQL dense-rank semantics: every record whose price ties with
	    one of the topN distinct prices is kept, so more than topN records may
	    come back.

	    rec  -- iterable of comma-delimited strings; field index 4 must parse
	            as a float (the price).
	    topN -- number of distinct prices to keep; passed to itertools.islice
	            as the stop value.

	    Returns a generator over the matching records, highest price first;
	    records with equal prices keep their input order.
	    """
	    import itertools
	    # Parse each price exactly once and materialise the input up front:
	    # the records are needed both to rank the prices and to build the
	    # result, and a one-shot iterator would be exhausted after one pass.
	    pricedRecords = [(float(line.split(",")[4]), line) for line in rec]
	    # Distinct prices in descending order, truncated to the best topN.
	    distinctPricesDesc = sorted(set(price for price, _ in pricedRecords), reverse=True)
	    # A set gives constant-time membership checks in the filter below.
	    topNPrices = set(itertools.islice(distinctPricesDesc, 0, topN))
	    # list.sort() is stable, so records sharing a price keep their input order.
	    pricedRecords.sort(key=lambda pair: pair[0], reverse=True)
	    topNPricedProducts = [line for price, line in pricedRecords if price in topNPrices]
	    return (y for y in topNPricedProducts)

	# Load the raw product lines and drop those whose price field (index 4) is empty.
	products = sc.textFile("/public/retail_db/products")
	productsFiltered = products.filter(lambda rec: rec.split(",")[4] != "")

	# Key each line by the integer in field 1 (presumably the category id --
	# confirm against the data set), group by that key, and keep the
	# dense-ranked top 5 prices of every group. The chain is parenthesised so
	# the statement no longer depends on backslash continuations, and the loop
	# body is indented again so the block is valid Python.
	topPricedProducts = (
	    productsFiltered
	    .map(lambda rec: (int(rec.split(",")[1]), rec))
	    .groupByKey()
	    .flatMap(lambda rec: getTopDenseN(rec[1], 5))
	)
	# collect() brings the result to the driver so it can be printed locally.
	for i in topPricedProducts.collect():
	    print(i)
No results found