KobaKhit · August 27, 2020 23:59
diff --git a/repartition_pyspark_dataframe.py b/repartition_pyspark_dataframe.py
 from pyspark.sql.functions import monotonically_increasing_id, row_number
 from pyspark.sql import Window
 from functools import reduce

 def partitionIt(size, num):
    '''
    Create a flat list of 0-based partition indices for `size` rows, with `num`
    rows per partition. The number of distinct indices is ceiling(size/num);
    the last partition is shorter when `size` is not a multiple of `num`.

    Args:
        size (int): number of rows/elements
        num (int) : number of elements in each partition, must be positive

    Return:
        a list of length `size` with partition indices, e.g. for num=4:
        `[0,0,0,0,1,1,1,1, ...]` (an empty list when `size` is 0)

    Raises:
        ValueError: if `num` is not a positive number
    '''
    if num <= 0:
        raise ValueError("num must be a positive integer")

    # Row i belongs to partition i // num. Computing it per row yields exactly
    # `size` entries for every size/num combination; the previous
    # floating-point stepping loop never terminated when size <= num and
    # returned too few entries when num was small compared to size.
    return [i // num for i in range(size)]

 # set up
 df = ...
 n = 1000 # number of records per partition

 # Number the rows 1..N in their current order and derive the partition index
 # directly from that number: rows 1..n -> Index 0, rows n+1..2n -> Index 1, ...
 # These are the same 0-based group ids partitionIt is meant to produce, but
 # computed inside Spark, so no count(), no driver-side Python list and no
 # second DataFrame/join are needed.
 # NOTE(review): the window has no partitionBy, so Spark is expected to sort
 # all rows in a single partition for this step - confirm that is acceptable
 # for the data size.
 row_order = Window.orderBy(monotonically_increasing_id())
 df = df.withColumn("Index", ((row_number().over(row_order) - 1) / n).cast("long"))

 # save
 # one output directory per Index value; the empty path is a placeholder
 df.write.partitionBy('Index').save('')
	from pyspark.sql.functions import monotonically_increasing_id, row_number
	from pyspark.sql import Window
	from functools import reduce

	def partitionIt(size, num):
	    '''
	    Create a flat list of 0-based partition indices for `size` rows, with `num`
	    rows per partition. The number of distinct indices is ceiling(size/num);
	    the last partition is shorter when `size` is not a multiple of `num`.

	    Args:
	        size (int): number of rows/elements
	        num (int) : number of elements in each partition, must be positive

	    Return:
	        a list of length `size` with partition indices, e.g. for num=4:
	        `[0,0,0,0,1,1,1,1, ...]` (an empty list when `size` is 0)

	    Raises:
	        ValueError: if `num` is not a positive number
	    '''
	    if num <= 0:
	        raise ValueError("num must be a positive integer")

	    # Row i belongs to partition i // num. Computing it per row yields exactly
	    # `size` entries for every size/num combination; the previous
	    # floating-point stepping loop never terminated when size <= num and
	    # returned too few entries when num was small compared to size.
	    return [i // num for i in range(size)]

	# set up
	df = ...
	n = 1000 # number of records per partition

	# Number the rows 1..N in their current order and derive the partition index
	# directly from that number: rows 1..n -> Index 0, rows n+1..2n -> Index 1, ...
	# These are the same 0-based group ids partitionIt is meant to produce, but
	# computed inside Spark, so no count(), no driver-side Python list and no
	# second DataFrame/join are needed.
	# NOTE(review): the window has no partitionBy, so Spark is expected to sort
	# all rows in a single partition for this step - confirm that is acceptable
	# for the data size.
	row_order = Window.orderBy(monotonically_increasing_id())
	df = df.withColumn("Index", ((row_number().over(row_order) - 1) / n).cast("long"))

	# save
	# one output directory per Index value; the empty path is a placeholder
	df.write.partitionBy('Index').save('')