justinnaldzin / gcloud_storage_delete.sh
Created August 6, 2018 13:00
Google Cloud Storage delete operations
# Delete all files in the bucket (quote the wildcard so the local shell doesn't expand it)
BUCKET_NAME=my_bucket
gsutil -m rm "gs://${BUCKET_NAME}/**"
# Delete the bucket itself
gsutil rb -f "gs://${BUCKET_NAME}"
justinnaldzin / get_ec2_instance_name.py
Created August 27, 2018 15:19
Returns the EC2 instance name
import logging
import boto3
from botocore.exceptions import ClientError, BotoCoreError
import requests
from requests import RequestException
def get_instance_name():
    try:
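The preview cuts off inside the `try:` block. The typical pattern queries the instance metadata service for the instance ID and then looks up the EC2 `Name` tag; the tag-extraction step can be sketched as a pure helper (the names here are illustrative, not the author's original code):

```python
def name_from_tags(tags):
    """Return the value of the 'Name' tag from an EC2 tag list, or None.

    `tags` has the shape boto3 returns, e.g. [{'Key': 'Name', 'Value': 'web-1'}].
    """
    for tag in tags or []:
        if tag.get('Key') == 'Name':
            return tag.get('Value')
    return None

print(name_from_tags([{'Key': 'Name', 'Value': 'web-1'}]))  # web-1
```

In the full function, `requests` would fetch the instance ID from the metadata endpoint and `boto3` would supply the tag list; the exceptions imported above are what the `except` clause would catch.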
justinnaldzin / get_folders_in_s3_bucket.py
Created August 31, 2018 14:14
List all folder names in S3 bucket under a prefix
import boto3
bucket_name = 'my_bucket_name'
prefix = 'some/path/'
print("Getting list of all folder names in S3 bucket {} under prefix {}".format(bucket_name, prefix))
folders_list = []
client = boto3.client('s3')
# Delimiter='/' groups keys so each "folder" appears once under CommonPrefixes.
# Note: list_objects returns at most 1000 results per call; use a paginator for larger buckets.
results = client.list_objects(Bucket=bucket_name, Prefix=prefix, Delimiter='/')
for folder in results.get('CommonPrefixes', []):
    folders_list.append(folder['Prefix'])
justinnaldzin / git_merge_master.sh
Created September 5, 2018 15:58
Merge the master branch into the current checked out branch
#!/bin/bash -x
# Merge the master branch into the current checked out branch
# More robust than parsing 'git branch' output
CURRENT=$(git rev-parse --abbrev-ref HEAD)
git checkout master
git fetch
git merge origin/master
git checkout "${CURRENT}"
git merge master
justinnaldzin / aws_utility.py
Created October 17, 2018 20:09
Listing objects and keys in an S3 bucket
import boto3


def get_matching_s3_objects(bucket, prefix='', suffix=''):
    """
    Fetch objects in an S3 bucket.

    :param bucket: Name of the S3 bucket.
    :param prefix: Only fetch objects whose key starts with
        this prefix (optional).
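The preview ends mid-docstring. A common way to finish such a helper is to paginate `list_objects_v2` and filter keys by suffix; the filtering step is shown here as a self-contained sketch over page dicts shaped like the S3 API response (an assumption, not the author's original code):

```python
def filter_keys(pages, suffix=''):
    """Yield keys from list_objects_v2-style pages whose key ends with `suffix`."""
    for page in pages:
        for obj in page.get('Contents', []):
            if obj['Key'].endswith(suffix):
                yield obj['Key']

pages = [{'Contents': [{'Key': 'logs/a.csv'}, {'Key': 'logs/b.txt'}]}, {}]
print(list(filter_keys(pages, '.csv')))  # ['logs/a.csv']
```

With boto3, the pages would come from `client.get_paginator('list_objects_v2').paginate(Bucket=bucket, Prefix=prefix)`.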
justinnaldzin / aws_s3_unzip_files.sh
Created November 15, 2018 16:42
Copy zip files from S3 to local directory, unzip and upload to S3
# Copy zip files from S3 to local directory, unzip and upload to S3
aws s3 cp s3://bucket/folder/ . --recursive
for f in *.zip; do unzip "$f"; done
aws s3 cp . s3://bucket/folder/ --recursive --exclude "*.zip"
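The same unzip step in Python's standard library, for environments without the `unzip` binary (the function name and paths are illustrative):

```python
import glob
import zipfile

def unzip_all(pattern='*.zip', dest='.'):
    """Extract every archive matching `pattern` into `dest`; return the extracted names."""
    extracted = []
    for path in sorted(glob.glob(pattern)):
        with zipfile.ZipFile(path) as zf:
            zf.extractall(dest)
            extracted.extend(zf.namelist())
    return extracted
```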

Load data from BigQuery

Using the BigQuery client library

pip install --upgrade google-cloud-bigquery
from google.cloud import bigquery
justinnaldzin / google_cloud_composer_manually_trigger_dag.sh
Created February 12, 2019 02:51
Google Cloud Composer - Manually trigger DAG runs using Airflow v1.10+
# Google Cloud Composer - Manually trigger DAG runs using Airflow v1.10+
ENVIRONMENT_NAME=my-composer
LOCATION=us-east1
# Trigger DAG - individual
DAG_ID=my_daily_dag
EXEC_DATE=2019-02-11
gcloud composer environments run ${ENVIRONMENT_NAME} --location ${LOCATION} trigger_dag -- -r manual__${EXEC_DATE} -e ${EXEC_DATE} ${DAG_ID}
# Trigger DAG - multiple

Unique ID column

Generate a unique identifier that deterministically produces the same result for the same row values. The ID column is positioned first in the DataFrame. Note that concat_ws skips null values, so two rows that differ only in which column is null can hash to the same ID.

from pyspark.sql.functions import sha2, concat_ws

id_col = 'unique_id'  # name for the generated ID column (left undefined in the original snippet)
columns = df.columns
df = df.withColumn(id_col, sha2(concat_ws("||", *columns), 256))
df = df.select([id_col] + columns)
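The same deterministic-ID idea, sketched in plain Python with `hashlib` standing in for Spark's `sha2`/`concat_ws` (illustrative, not part of the gist):

```python
import hashlib

def row_id(values, sep='||'):
    """SHA-256 of the row's values joined by `sep`: same values, same ID."""
    joined = sep.join(str(v) for v in values)
    return hashlib.sha256(joined.encode('utf-8')).hexdigest()

print(row_id(['alice', 42]) == row_id(['alice', 42]))  # True
```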
justinnaldzin / gcp_kms_encrypt_decrypt.py
Created February 15, 2019 21:05
GCP Cloud KMS encrypting and decrypting data
from google.cloud import kms_v1

def encrypt(project_id, location_id, key_ring_id, crypto_key_id, plaintext):
    """Encrypts input plaintext data using the provided symmetric CryptoKey."""
    # Creates an API client for the KMS API.
    client = kms_v1.KeyManagementServiceClient()
    # The resource name of the CryptoKey.
    name = client.crypto_key_path(project_id, location_id, key_ring_id, crypto_key_id)
    # Encrypt the plaintext (completion sketch; the gist preview truncates here).
    response = client.encrypt(name, plaintext)
    return response.ciphertext