PySpark or Pydoop - JSON (read and write) locally or on HDFS
from pyspark.sql import SparkSession
import databricks.koalas as ks
import json
import pydoop.hdfs as hdfs
spark = SparkSession.builder.enableHiveSupport().getOrCreate()
my_data = {"name": ["John", "Mary", "Kevin"],
           "area": ["London", "Munich", "Berlin"],
           "age": [33, 56, 44]}
############################################
############################################
# Working with json files on a local drive #
############################################
############################################
# Write dict to json file
with open("db.json", "w") as f:
json.dump(my_data, f, indent=4)
# Read json file
with open("db.json", "r") as f:
my_data = json.load(f)
###################################
###################################
# Working with json files on HDFS #
###################################
###################################
file_path = '/dapsen/path/to/my/file.json'
my_path = '/dapsen/path/to/my/'  # directory prefix used by the Spark/Koalas examples below
######################
# Write json to hdfs #
######################
# Using Pydoop - 'wt' means write as plain text
with hdfs.open(file_path, 'wt') as file:
    json.dump(my_data, file)
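# Alternative sketch (an assumption, not from the original gist): with binary
# mode 'wb' you would need to encode the JSON string yourself, since
# json.dumps returns a str:
# with hdfs.open(file_path, 'wb') as file:
#     file.write(json.dumps(my_data).encode('utf-8'))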
# Using PySpark/Koalas
ks.DataFrame(my_data).to_json(my_path + "export_test", num_files=1)
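# Note: Koalas writes the output as a directory of part files at the given
# path; num_files=1 coalesces the result into a single part file.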
#######################
# Read json from hdfs #
#######################
# Using Pydoop - read the json file back from hdfs to check it worked properly
with hdfs.open(file_path, 'r') as file:
    file_imported_from_hdfs = json.load(file)
# Using Koalas
koalas_df = ks.read_json(my_path + "export_test")
# Using PySpark read.json()
df = spark.read.json(my_path + "export_test")
# Using PySpark read.format.load()
df = spark.read.format('org.apache.spark.sql.json').load(my_path + "export_test")
# Using PySpark read.option().json()
df = spark.read.option("multiline", "true").json(my_path + "export_test")
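# Note: spark.read.json expects line-delimited JSON by default; the
# "multiline" option tells Spark to instead parse files that each contain a
# single (possibly pretty-printed) JSON document.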
# Then view the data...
koalas_df.head(5)
df.printSchema()
df.show()
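# Optionally pull a small sample back to the driver as pandas (a sketch;
# fine for small data, but note it collects rows locally):
# df.limit(5).toPandas()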
mh0w commented Oct 18, 2023

Regarding the need to explicitly specify wt rather than just specifying w:

I think it's a matter of matching the write mode (e.g., wt or wb) to the object you're writing (e.g., the output of the json module, which is a string rather than a bytes object).

Since json.dump produces a string, you need to write text, i.e. use wt rather than wb. Your approach of using wt is more sensible than the one I previously suggested.

My understanding is that Python's built-in open() defaults the write (w) mode to text (wt), but file-like APIs such as pydoop's hdfs.open() may default to binary (wb), so in this case we have to explicitly state that we want the text (wt) write mode.
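A minimal local sketch of the distinction (standard library only; db.json is just a throwaway file name):

import json

data = {"key": "value"}

# json.dumps returns a str, so a text-mode handle accepts it directly
with open("db.json", "wt") as f:
    f.write(json.dumps(data))

# a binary-mode handle needs bytes, so the string must be encoded first
with open("db.json", "wb") as f:
    f.write(json.dumps(data).encode("utf-8"))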

Some things I recall reading to come to these conclusions:

https://stackoverflow.com/questions/23051062/open-files-in-r...

https://docs.python.org/3.0/whatsnew/3.0.html#text-vs-data-i...

https://stackoverflow.com/questions/53402930/use-pyarrow-and...
