Created
February 16, 2023 12:50
-
-
Save freemandealer/aef136754d258e77d6ea6acd314eeddd to your computer and use it in GitHub Desktop.
Generate LowCardinality Data as JSON & ORC format
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
# pip install pyspark before execution
import json
import os
import random
import shutil
import sys

from pyspark import SparkContext
from pyspark.sql import SparkSession
# Number of rows (one JSON record per row) to generate.
ROW_NUM = 500
# Total number of columns per row, counting the integer key column col_0.
COL_NUM = 50
def delete_file_or_dir(path):
    """Remove the file or directory tree at ``path`` if it exists.

    Args:
        path: Filesystem path of a regular file, a symlink, or a directory.
            A path that does not exist is silently ignored.

    Raises:
        OSError: If the entry exists but cannot be removed.
    """
    if not os.path.exists(path):
        return
    if os.path.isfile(path) or os.path.islink(path):
        # Symlinks are unlinked rather than followed, so a link pointing at
        # a directory never causes the link target to be deleted.
        os.remove(path)
    elif os.path.isdir(path):
        # shutil.rmtree avoids spawning a shell: paths containing spaces or
        # shell metacharacters are handled safely, and no external ``rm``
        # binary is required.
        shutil.rmtree(path)
# Fixed pool of fruit names used as the only possible string values, so each
# generated string column has low cardinality. Built once at import time
# instead of on every call.
_FRUITS = (
    "Apple", "Banana", "Orange", "Grapes", "Strawberry", "Mango", "Kiwi",
    "Pineapple", "Cherry", "Watermelon", "Lemon", "Peach", "Plum", "Lychee",
    "Avocado", "Blueberry", "Raspberry",
)


def random_fruit():
    """Return one fruit name picked at random from the fixed pool.

    Returns:
        str: One of the 17 names in ``_FRUITS``.
    """
    return random.choice(_FRUITS)
def gen_sparse_json():
    """Append ROW_NUM generated records to ``output.json``, one JSON object per line.

    Each record holds an integer ``col_0`` (the 1-based row number) plus
    string columns ``col_1`` .. ``col_<COL_NUM - 1>`` filled with random
    fruit names. The file is opened in append mode, so a caller that wants
    a fresh file must delete any existing ``output.json`` first.
    """
    with open('output.json', 'a') as f:
        for i in range(1, ROW_NUM + 1):
            row = {'col_0': i}
            for j in range(1, COL_NUM):
                row['col_' + str(j)] = random_fruit()
            # json.dumps emits standard double-quoted JSON. str(row) would
            # write a Python dict repr with single quotes, which strict JSON
            # parsers reject.
            f.write(json.dumps(row) + "\n")
def convert_json_to_orc():
    """Read ``output.json`` with a local Spark session and save it as ORC at ``output.orc``.

    The DataFrame is coalesced to a single partition before writing so that
    the output is not split across several ORC files. Spark is shut down
    before returning, whether or not the conversion succeeded.
    """
    # enlarge memory, otherwise jvm OOM
    # NOTE(review): 300g is far above typical machine memory, and with a
    # "local" master the work presumably runs inside the driver JVM; confirm
    # this is the setting that actually prevents the OOM.
    SparkContext.setSystemProperty('spark.executor.memory', '300g')
    sc = SparkContext("local", "spark.py")
    spark = None
    try:
        spark = SparkSession.builder.getOrCreate()
        df = spark.read.load("output.json", format="json")
        # coalesce to stop splitting large orc file (we want one output)
        df.coalesce(1).write.format("orc").save("output.orc")
    finally:
        # Always release Spark: only one SparkContext may be active per
        # process, so leaving it running would make a second call to this
        # function fail when it tries to create another context.
        if spark is not None:
            spark.stop()
        else:
            sc.stop()
# Clear outputs left over from a previous run first: the JSON generator
# appends to output.json, and Spark's default save mode refuses to write to
# an output.orc path that already exists.
delete_file_or_dir('output.json')
delete_file_or_dir('output.orc')
# Generate the JSON data, then convert it to ORC.
gen_sparse_json()
convert_json_to_orc()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment