jhuangtw · April 5, 2022 09:38
diff --git a/databricks_connection_to_gcloud_sql.py b/databricks_connection_to_gcloud_sql.py
 # Overview:
 # Install cloud_sql_proxy binary with appropriate credentials on all driver and worker nodes.

 # References:
 # https://cloud.google.com/sql/docs/postgres/connect-external-app#proxy
 # https://cloud.google.com/sql/docs/postgres/sql-proxy#authentication-options

 # Step 1.
 # Follow https://cloud.google.com/sql/docs/postgres/connect-external-app#proxy to set up:
 # - enable Cloud SQL API
 # - service account credentials, with "Cloud SQL Client" role. download the .json credential file
 # - find your instance connection name from the Cloud SQL Instance details page

 # Step 2.
 # set up a databricks initscript:
 dbutils.fs.put("/databricks/init/cluster_name/gcloud.sh",
  """
  #!/bin/bash

  wget -q https://dl.google.com/cloudsql/cloud_sql_proxy.linux.amd64 -O cloud_sql_proxy
  chmod +x cloud_sql_proxy
  ./cloud_sql_proxy -instances=_your_cloud_sql_instance_name_=tcp:3306 -credential_file=/path/to/gcloud_creds.json 2>&1 &
  
  echo "finished installing gcloud postgres proxy"
  """
 )
 # Run the above code in notebook, and (re)start cluster to pick-up the init script results

 # Step 3.
 # now in a notebook attached to the cluster initialized as above:
 jdbcHostname = "127.0.0.1"  # talk to local proxy
 jdbcDatabase = "database_name"
 jdbcPort = 3306  # this should line up with the tcp port number specified in init script
 username = "db_username"
 password = "db_password"
 # this example show's postgresql, I'd imagine mySQL to be quite similar
 jdbcUrl = "jdbc:postgresql://{0}:{1}/{2}?user={3}&password={4}".format(jdbcHostname, jdbcPort, jdbcDatabase, username, password)

 tableName = '_your_table_name_'

 spark.read \
     .format("jdbc") \
     .option("driver", "org.postgresql.Driver") \
     .option("url", jdbcUrl) \
     .option("dbtable", tableName) \
     .load()
	# Overview:
	# Install cloud_sql_proxy binary with appropriate credentials on all driver and worker nodes.

	# References:
	# https://cloud.google.com/sql/docs/postgres/connect-external-app#proxy
	# https://cloud.google.com/sql/docs/postgres/sql-proxy#authentication-options

	# Step 1.
	# Follow https://cloud.google.com/sql/docs/postgres/connect-external-app#proxy to set up:
	# - enable Cloud SQL API
	# - service account credentials, with "Cloud SQL Client" role. download the .json credential file
	# - find your instance connection name from the Cloud SQL Instance details page

	# Step 2.
	# set up a databricks initscript:
	dbutils.fs.put("/databricks/init/cluster_name/gcloud.sh",
	"""
	#!/bin/bash

	wget -q https://dl.google.com/cloudsql/cloud_sql_proxy.linux.amd64 -O cloud_sql_proxy
	chmod +x cloud_sql_proxy
	./cloud_sql_proxy -instances=_your_cloud_sql_instance_name_=tcp:3306 -credential_file=/path/to/gcloud_creds.json 2>&1 &

	echo "finished installing gcloud postgres proxy"
	"""
	)
	# Run the above code in notebook, and (re)start cluster to pick-up the init script results

	# Step 3.
	# now in a notebook attached to the cluster initialized as above:
	jdbcHostname = "127.0.0.1" # talk to local proxy
	jdbcDatabase = "database_name"
	jdbcPort = 3306 # this should line up with the tcp port number specified in init script
	username = "db_username"
	password = "db_password"
	# this example show's postgresql, I'd imagine mySQL to be quite similar
	jdbcUrl = "jdbc:postgresql://{0}:{1}/{2}?user={3}&password={4}".format(jdbcHostname, jdbcPort, jdbcDatabase, username, password)

	tableName = '_your_table_name_'

	spark.read \
	.format("jdbc") \
	.option("driver", "org.postgresql.Driver") \
	.option("url", jdbcUrl) \
	.option("dbtable", tableName) \
	.load()