Skip to content

Instantly share code, notes, and snippets.

@dgadiraju
Created March 14, 2017 05:56
Show Gist options
  • Select an option

  • Save dgadiraju/7c33be016d50efbc954c376df8cee24c to your computer and use it in GitHub Desktop.

Select an option

Save dgadiraju/7c33be016d50efbc954c376df8cee24c to your computer and use it in GitHub Desktop.
def getTopDenseN(rec, topN):
    """Return the records whose price is among the topN highest distinct prices.

    Simulates a dense rank: every record that shares one of the topN
    distinct prices is kept, so the result may hold more than topN records.

    Args:
        rec: Iterable of comma-separated record strings. The price is read
            from the fifth field (index 4) and parsed with ``float``. Any
            iterable is accepted, including one-shot iterators.
        topN: Number of distinct prices to keep, counted from the highest.

    Returns:
        An iterator over the matching records, ordered by price descending.
        Records with equal prices keep their relative input order.

    Raises:
        IndexError: If a record has fewer than five comma-separated fields.
        ValueError: If a price field cannot be parsed as a float, or if
            topN is negative (raised by ``itertools.islice``).
    """
    import itertools

    # Parse each record exactly once and keep the price next to it. This also
    # materialises the input, so a generator is not exhausted before the
    # filtering pass below.
    pricedRecords = [(float(r.split(",")[4]), r) for r in rec]

    # Distinct prices, highest first, truncated to the requested rank depth.
    distinctPricesDesc = sorted({price for price, _ in pricedRecords}, reverse=True)
    topNPrices = set(itertools.islice(distinctPricesDesc, 0, topN))

    # list.sort is stable, so ties retain their original input order.
    pricedRecords.sort(key=lambda pair: pair[0], reverse=True)

    return iter([r for price, r in pricedRecords if price in topNPrices])
# Read the products data set as an RDD of text lines.
products = sc.textFile("/public/retail_db/products")

# Drop records whose fifth field (the price parsed by getTopDenseN) is empty,
# since float("") would raise.
productsFiltered = products.filter(lambda line: line.split(",")[4] != "")

# Key each record by its second field converted to int (presumably the
# category id -- confirm against the data set), group the records per key,
# and keep the records holding the 5 highest distinct prices in each group.
for i in (
    productsFiltered
    .map(lambda line: (int(line.split(",")[1]), line))
    .groupByKey()
    .flatMap(lambda keyed: getTopDenseN(keyed[1], 5))
    .collect()
):
    print(i)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment