freemandealer · February 16, 2023 12:50
diff --git a/sparse_data_gen.py b/sparse_data_gen.py
 #!/usr/bin/env python
 # pip install pyspark before execution
 import random
 import json
 import os
 import sys
 from pyspark.sql import SparkSession
 from pyspark import SparkContext

 ROW_NUM = 500
 COL_NUM = 50

 def delete_file_or_dir(path):
    """Remove ``path`` if it exists, whether it is a file or a directory.

    A path that does not exist is silently ignored. Directories are removed
    recursively together with their contents.

    Args:
        path: Filesystem path of the file or directory to delete.

    Raises:
        OSError: If the path exists but cannot be removed.
    """
    # Function-scope import keeps this helper self-contained.
    import shutil

    if not os.path.exists(path):
        return
    if os.path.isfile(path) or os.path.islink(path):
        # Symlinks are unlinked, never followed into their target.
        os.remove(path)
    elif os.path.isdir(path):
        # shutil.rmtree instead of a shell "rm -rf" string: no quoting
        # problems with spaces/metacharacters in the path, and failures
        # surface as exceptions instead of being dropped.
        shutil.rmtree(path)


 def random_fruit():
    """Return one fruit name chosen at random from a fixed list."""
    candidates = [
        "Apple", "Banana", "Orange", "Grapes", "Strawberry", "Mango",
        "Kiwi", "Pineapple", "Cherry", "Watermelon", "Lemon", "Peach",
        "Plum", "Lychee", "Avocado", "Blueberry", "Raspberry",
    ]
    picked = random.choice(candidates)
    return picked


 def gen_sparse_json():
    """Append ROW_NUM generated rows to ``output.json``, one JSON object per line.

    Each row has ``col_0`` set to the 1-based row number and ``col_1`` through
    ``col_<COL_NUM - 1>`` set to values returned by random_fruit().

    The file is opened in append mode, so an existing ``output.json`` is
    extended rather than replaced; remove it first for a fresh data set.
    """
    with open('output.json', 'a') as f:
        for i in range(1, ROW_NUM + 1):
            row = {'col_0': i}
            row.update(
                {'col_' + str(j): random_fruit() for j in range(1, COL_NUM)}
            )
            # json.dumps emits real JSON (double-quoted keys and strings).
            # str(row) wrote a Python dict repr with single quotes, which
            # strict JSON parsers reject.
            f.write(json.dumps(row) + "\n")


 def convert_json_to_orc():
    """Load ``output.json`` with a local Spark session and save it as ORC.

    The result is written to ``output.orc`` through Spark's ORC writer. The
    Spark session (and its context) is stopped when the function returns or
    raises, so the function can be called again in the same process.
    """
    # enlarge memory, otherwise jvm OOM
    # NOTE(review): with a "local" master the work presumably runs inside the
    # driver JVM, so spark.executor.memory may have no effect -- confirm
    # whether spark.driver.memory is what is actually needed here.
    SparkContext.setSystemProperty('spark.executor.memory', '300g')
    # The context is created explicitly so getOrCreate() below reuses it.
    SparkContext("local", "spark.py")
    spark = SparkSession.builder.getOrCreate()
    try:
        df = spark.read.load("output.json", format="json")
        # coalesce to stop splitting large orc file (we want one output)
        df.coalesce(1).write.format("orc").save("output.orc")
    finally:
        # Release the JVM-side resources; without this a second call fails
        # because only one SparkContext may be active at a time.
        spark.stop()


 # Clear outputs left over from a previous run, then regenerate both files.
 for stale_output in ('output.json', 'output.orc'):
    delete_file_or_dir(stale_output)
 gen_sparse_json()
 convert_json_to_orc()
	#!/usr/bin/env python
	# pip install pyspark before execution
	import random
	import json
	import os
	import sys
	from pyspark.sql import SparkSession
	from pyspark import SparkContext

	ROW_NUM = 500
	COL_NUM = 50

	def delete_file_or_dir(path):
	    """Remove ``path`` if it exists, whether it is a file or a directory.

	    A path that does not exist is silently ignored. Directories are removed
	    recursively together with their contents.

	    Args:
	        path: Filesystem path of the file or directory to delete.

	    Raises:
	        OSError: If the path exists but cannot be removed.
	    """
	    # Function-scope import keeps this helper self-contained.
	    import shutil

	    if not os.path.exists(path):
	        return
	    if os.path.isfile(path) or os.path.islink(path):
	        # Symlinks are unlinked, never followed into their target.
	        os.remove(path)
	    elif os.path.isdir(path):
	        # shutil.rmtree instead of a shell "rm -rf" string: no quoting
	        # problems with spaces/metacharacters in the path, and failures
	        # surface as exceptions instead of being dropped.
	        shutil.rmtree(path)


	def random_fruit():
	    """Return one fruit name chosen at random from a fixed list."""
	    fruits = [
	        "Apple", "Banana", "Orange", "Grapes", "Strawberry", "Mango",
	        "Kiwi", "Pineapple", "Cherry", "Watermelon", "Lemon", "Peach",
	        "Plum", "Lychee", "Avocado", "Blueberry", "Raspberry",
	    ]
	    return random.choice(fruits)


	def gen_sparse_json():
	    """Append ROW_NUM generated rows to ``output.json``, one JSON object per line.

	    Each row has ``col_0`` set to the 1-based row number and ``col_1`` through
	    ``col_<COL_NUM - 1>`` set to values returned by random_fruit().

	    The file is opened in append mode, so an existing ``output.json`` is
	    extended rather than replaced; remove it first for a fresh data set.
	    """
	    with open('output.json', 'a') as f:
	        for i in range(1, ROW_NUM + 1):
	            row = {'col_0': i}
	            for j in range(1, COL_NUM):
	                row['col_' + str(j)] = random_fruit()
	            # json.dumps emits real JSON (double-quoted keys and strings).
	            # str(row) wrote a Python dict repr with single quotes, which
	            # strict JSON parsers reject.
	            f.write(json.dumps(row) + "\n")


	def convert_json_to_orc():
	    """Load ``output.json`` with a local Spark session and save it as ORC.

	    The result is written to ``output.orc`` through Spark's ORC writer. The
	    Spark session (and its context) is stopped when the function returns or
	    raises, so the function can be called again in the same process.
	    """
	    # enlarge memory, otherwise jvm OOM
	    # NOTE(review): with a "local" master the work presumably runs inside the
	    # driver JVM, so spark.executor.memory may have no effect -- confirm
	    # whether spark.driver.memory is what is actually needed here.
	    SparkContext.setSystemProperty('spark.executor.memory', '300g')
	    # The context is created explicitly so getOrCreate() below reuses it.
	    SparkContext("local", "spark.py")
	    spark = SparkSession.builder.getOrCreate()
	    try:
	        df = spark.read.load("output.json", format="json")
	        # coalesce to stop splitting large orc file (we want one output)
	        df.coalesce(1).write.format("orc").save("output.orc")
	    finally:
	        # Release the JVM-side resources; without this a second call fails
	        # because only one SparkContext may be active at a time.
	        spark.stop()


	# Clear outputs left over from a previous run, then regenerate both files.
	for stale_output in ('output.json', 'output.orc'):
	    delete_file_or_dir(stale_output)
	gen_sparse_json()
	convert_json_to_orc()