import random

from pyspark.sql.window import Window
from pyspark.sql.functions import cume_dist, row_number

# 100 single-column rows of random integers in [1, 100].
raw_data = [(random.randint(1, 100),) for _ in range(100)]
# `sqlContext` is predefined when running in the PySpark shell.
df = sqlContext.createDataFrame(raw_data, ['bin'])

# cume_dist gives the cumulative distribution over the ordered bin values;
# row_number gives each row's position in the same ordering.
ws = Window.orderBy('bin')
cdf = df.select(df['bin'], cume_dist().over(ws).alias('c'), row_number().over(ws).alias('i'))
# Keep the first row and every fifth row to thin the output.
cdf = cdf.filter("i = 1 OR i % 5 = 0")
"""
>>> cdf.show(21)
+---+----+---+
|bin|   c|  i|
+---+----+---+
|  1|0.02|  1|
|  2|0.05|  5|
| 10| 0.1| 10|
| 16|0.15| 15|
| 19| 0.2| 20|
| 27|0.27| 25|
| 30|0.32| 30|
| 32|0.35| 35|
| 35| 0.4| 40|
| 43|0.46| 45|
| 48|0.51| 50|
| 50|0.56| 55|
| 55|0.62| 60|
| 59|0.65| 65|
| 75|0.71| 70|
| 77|0.75| 75|
| 81| 0.8| 80|
| 86|0.86| 85|
| 88|0.91| 90|
| 95|0.98| 95|
|100| 1.0|100|
+---+----+---+
"""