mauliksoneji · October 28, 2019 06:37
diff --git a/gcs_spark_client.py b/gcs_spark_client.py
 class GCSClient(object):
     """Read and write Google Cloud Storage data through a Spark session.

     The constructor registers the GCS Hadoop connector classes on the
     session's Hadoop configuration; the remaining methods are thin wrappers
     over ``spark.read`` and ``DataFrame.write``.
     """

     def __init__(self, spark, projectId):
         """Store the session and configure its Hadoop settings for GCS.

         Args:
             spark: Session object exposing ``_jsc``, ``_jvm`` and ``read``
                 (presumably a ``pyspark.sql.SparkSession``).
             projectId: Value written to the ``fs.gs.project.id`` property.
         """
         self.spark = spark
         hadoop_conf = self.spark._jsc.hadoopConfiguration()
         hadoop_conf.set(
             "fs.gs.impl",
             "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem",
         )
         hadoop_conf.set(
             "fs.AbstractFileSystem.gs.impl",
             "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS",
         )
         hadoop_conf.set("fs.gs.project.id", projectId)
         # NOTE: only the service-account switch is enabled here; no key file
         # or account e-mail property is set by this class.
         hadoop_conf.set("fs.gs.auth.service.account.enable", "true")

     def get_json_as_df(self, path):
         """Return the result of ``spark.read.json`` for ``path``."""
         return self.spark.read.json(path=path)

     def get_parquet_as_df(self, path):
         """Return the result of ``spark.read.parquet`` for ``path``.

         Args:
             path: A single path, or a list/tuple of paths. A list or tuple
                 is unpacked so that each entry is passed as its own
                 positional argument.
         """
         if isinstance(path, (list, tuple)):
             return self.spark.read.parquet(*path)
         return self.spark.read.parquet(path)

     def export_to_gcs(self, data_frame, export_dir):
         """Replace ``export_dir`` with ``data_frame`` written as parquet.

         The existing location is deleted (with the recursive flag set)
         before the write starts, so a failed write leaves nothing behind
         at ``export_dir``.

         Args:
             data_frame: Object exposing ``write.format(...).save(...)``.
             export_dir: Destination path string.
         """
         hadoop_conf = self.spark._jsc.hadoopConfiguration()
         output_path = self.spark._jvm.org.apache.hadoop.fs.Path(export_dir)
         file_system = output_path.getFileSystem(hadoop_conf)
         file_system.delete(output_path, True)

         data_frame.write.format('parquet').save(export_dir)
	class GCSClient(object):
	    """Read and write Google Cloud Storage data through a Spark session.

	    The constructor registers the GCS Hadoop connector classes on the
	    session's Hadoop configuration; the remaining methods are thin wrappers
	    over ``spark.read`` and ``DataFrame.write``.
	    """

	    def __init__(self, spark, projectId):
	        """Store the session and configure its Hadoop settings for GCS.

	        Args:
	            spark: Session object exposing ``_jsc``, ``_jvm`` and ``read``
	                (presumably a ``pyspark.sql.SparkSession``).
	            projectId: Value written to the ``fs.gs.project.id`` property.
	        """
	        self.spark = spark
	        hadoop_conf = self.spark._jsc.hadoopConfiguration()
	        hadoop_conf.set(
	            "fs.gs.impl",
	            "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem",
	        )
	        hadoop_conf.set(
	            "fs.AbstractFileSystem.gs.impl",
	            "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS",
	        )
	        hadoop_conf.set("fs.gs.project.id", projectId)
	        # NOTE: only the service-account switch is enabled here; no key file
	        # or account e-mail property is set by this class.
	        hadoop_conf.set("fs.gs.auth.service.account.enable", "true")

	    def get_json_as_df(self, path):
	        """Return the result of ``spark.read.json`` for ``path``."""
	        return self.spark.read.json(path=path)

	    def get_parquet_as_df(self, path):
	        """Return the result of ``spark.read.parquet`` for ``path``.

	        Args:
	            path: A single path, or a list/tuple of paths. A list or tuple
	                is unpacked so that each entry is passed as its own
	                positional argument.
	        """
	        if isinstance(path, (list, tuple)):
	            return self.spark.read.parquet(*path)
	        return self.spark.read.parquet(path)

	    def export_to_gcs(self, data_frame, export_dir):
	        """Replace ``export_dir`` with ``data_frame`` written as parquet.

	        The existing location is deleted (with the recursive flag set)
	        before the write starts, so a failed write leaves nothing behind
	        at ``export_dir``.

	        Args:
	            data_frame: Object exposing ``write.format(...).save(...)``.
	            export_dir: Destination path string.
	        """
	        hadoop_conf = self.spark._jsc.hadoopConfiguration()
	        output_path = self.spark._jvm.org.apache.hadoop.fs.Path(export_dir)
	        file_system = output_path.getFileSystem(hadoop_conf)
	        file_system.delete(output_path, True)

	        data_frame.write.format('parquet').save(export_dir)