pip install --upgrade google-cloud-bigquery

from google.cloud import bigquery

# Delete all files in bucket
# Delete every object in the bucket, then remove the bucket itself.
BUCKET_NAME=my_bucket
# Quote the URL: an unquoted ** could be expanded by the shell if
# globbing matches local files. -m parallelizes the per-object deletes;
# ** matches objects at any depth under the bucket.
gsutil -m rm "gs://${BUCKET_NAME}/**"
# Remove the (now empty) bucket; -f continues past individual errors.
gsutil rb -f "gs://${BUCKET_NAME}"
| import logging | |
| import boto3 | |
| from botocore.exceptions import ClientError, BotoCoreError | |
| import requests | |
| from requests import RequestException | |
| def get_instance_name(): | |
| try: |
| import boto3 | |
| bucket_name = 'my_bucket_name' | |
| prefix = 'some/path/' | |
| print("Getting list of all folder names in S3 bucket {} under prefix {}".format(bucket_name, prefix)) | |
| folders_list = [] | |
| client = boto3.client('s3') | |
| results = client.list_objects(Bucket=bucket_name, Prefix=prefix, Delimiter='/') | |
| for folder in results.get('CommonPrefixes'): |
#!/bin/bash -x
# Merge the latest master branch into the currently checked-out branch.

# git rev-parse is the robust way to get the current branch name;
# parsing `git branch | grep "*"` output breaks on detached HEAD and
# on branch names the porcelain output decorates.
CURRENT=$(git rev-parse --abbrev-ref HEAD)

# Update the local master branch from the remote.
git checkout master
git fetch
git merge origin/master

# Return to the original branch and merge the refreshed master into it.
# (The original `git merge master ${CURRENT}` passed the current branch
# as a second merge head — a no-op head that requests an octopus merge.)
git checkout "${CURRENT}"
git merge master
| import boto3 | |
| def get_matching_s3_objects(bucket, prefix='', suffix=''): | |
| """ | |
| Fetch objects in an S3 bucket. | |
| :param bucket: Name of the S3 bucket. | |
| :param prefix: Only fetch objects whose key starts with | |
| this prefix (optional). |
# Copy zip files from S3 to the local directory, unzip them, and upload
# the extracted contents back to the same S3 prefix (zips excluded).
aws s3 cp "s3://bucket/folder/" . --recursive

# Quote "$f" so archive names containing spaces survive word splitting.
for f in *.zip; do
    unzip "$f"
done

# --exclude "*.zip" keeps the original archives from being re-uploaded.
aws s3 cp . "s3://bucket/folder/" --recursive --exclude "*.zip"
# Google Cloud Composer - manually trigger DAG runs using Airflow v1.10+
ENVIRONMENT_NAME=my-composer
LOCATION=us-east1

# Trigger DAG - individual
DAG_ID=my_daily_dag
EXEC_DATE=2019-02-11
# Everything after `--` is forwarded to the Airflow `trigger_dag` CLI:
# -r names the run, -e sets the execution date. Expansions are quoted
# so values containing shell metacharacters cannot split or glob.
gcloud composer environments run "${ENVIRONMENT_NAME}" \
    --location "${LOCATION}" \
    trigger_dag -- -r "manual__${EXEC_DATE}" -e "${EXEC_DATE}" "${DAG_ID}"

# Trigger DAG - multiple
Generate a unique identifier that deterministically produces the same result each time, based on the values in the row. The ID column will be positioned as the first column of the DataFrame.
from pyspark.sql.functions import sha2, concat_ws

# Name of the deterministic ID column. The original snippet referenced
# `id_col` without ever defining it (NameError); define it explicitly.
id_col = "row_id"

# Capture the original column order before the ID column is added.
columns = df.columns

# SHA-256 over all column values joined with "||" yields a stable,
# content-derived identifier for each row.
# NOTE(review): concat_ws silently skips NULLs, so rows that differ only
# in which column is NULL can hash identically — confirm this collision
# risk is acceptable for the data at hand.
df = df.withColumn(id_col, sha2(concat_ws("||", *df.columns), 256))

# Reorder so the ID column comes first, followed by the original columns.
df = df.select([id_col] + columns)
| from google.cloud import kms_v1 | |
| def encrypt(project_id, location_id, key_ring_id, crypto_key_id, plaintext): | |
| """Encrypts input plaintext data using the provided symmetric CryptoKey.""" | |
| # Creates an API client for the KMS API. | |
| client = kms_v1.KeyManagementServiceClient() | |
| # The resource name of the CryptoKey. |