Skip to content

Instantly share code, notes, and snippets.

@cjac
Created January 24, 2025 18:42
Show Gist options
  • Save cjac/bc82ef841a9f88360a64fd8181f3adeb to your computer and use it in GitHub Desktop.
Save cjac/bc82ef841a9f88360a64fd8181f3adeb to your computer and use it in GitHub Desktop.
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql import SQLContext
from pyspark.sql import DataFrame
# Reproduction script: build a small DataFrame whose second column is a dict
# per row, display it through a temporary view, then write it to BigQuery with
# the "bigquery" data source.

# Build the session through the builder and do not create a SparkContext
# before this point: getOrCreate() would reuse an already-running context, and
# the master / application name requested here would not be applied to it.
spark = (
    SparkSession.builder
    .master('yarn')
    .appName('spark-bigquery-demo')
    .getOrCreate()
)

# Cloud Storage location passed to the connector as "temporaryGcsBucket",
# used for temporary BigQuery export data.
# NOTE(review): the value carries a "gs://" scheme and a sub-path; confirm the
# connector version in use accepts that form rather than a bare bucket name.
bucket = "gs://cjac-docker-on-yarn/spark-bigquery-demo/"
spark.conf.set("temporaryGcsBucket", bucket)

# Each row is (name, properties). The 'number' values in the first three rows
# are far larger than a signed 64-bit integer can hold, while 'eye' covers a
# normal string, None and the empty string.
# NOTE(review): presumably this mix of value types is what the reproduction
# is meant to exercise -- keep the literals exactly as they are.
dataDictionary = [
    ('James', {'hair': 'black', 'eye': 'brown', 'number': 1000000000000000000000000000000000000}),
    ('Michael', {'hair': 'brown', 'eye': None, 'number': 9999999999999999999999999999999999999}),
    ('Robert', {'hair': 'red', 'eye': 'black', 'number': 4500000000000000000000000000000000000}),
    ('Washington', {'hair': 'red', 'eye': 'grey', 'number': 420}),
    ('Jefferson', {'hair': 'red', 'eye': '', 'number': 380}),
]

# Load the rows into a DataFrame; only column names are given, so Spark
# infers the column types from the data.
reproductionDf = spark.createDataFrame(data=dataDictionary, schema=["name", "properties"])

query = '''
SELECT * FROM reproduction_view
'''

reproductionDf.createOrReplaceTempView("reproduction_view")
try:
    # Print every row untruncated so the inferred values are visible in the
    # driver output before the write is attempted.
    spark.sql(query).show(truncate=False)

    # Write the DataFrame to the BigQuery table via the connector.
    (
        reproductionDf.write.format("bigquery")
        .option("table", "reproduction_dataset.reproduction_output")
        .save()
    )
finally:
    # Drop the view even when show() or the write raises, so a failed run
    # does not leave it registered in a long-lived session.
    spark.catalog.dropTempView("reproduction_view")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment