# ==================================================
# Create a BigQuery Data Transfer job with Terraform
# ==================================================
# This code was written for this article:
# https://medium.com/@edonosotti/importing-big-data-in-bigquery-with-data-transfer-service-and-terraform-dc55fdfdc821
#
# CSV file template: <<EOF
# customer_id,card_id,amount
# 1,0001,10.00
# 2,0002,2.00
# 3,0003,8.00
# EOF
# --------------------------------------------------
# VARIABLES
# Set these before applying the configuration
# Do not store sensitive values in configuration
# files; pass them at runtime via environment
# variables or with `-var` (see the example after
# the variable declarations below)
# --------------------------------------------------
variable "project_id" {
  type        = string
  description = "Google Cloud Platform Project ID"
}
variable "region" {
  type        = string
  description = "Google Cloud Platform region"
}
variable "aws_s3_bucket_name" {
  type        = string
  description = "Name of the AWS S3 bucket"
}
variable "aws_access_key_id" {
  type        = string
  description = "AWS Access Key ID"
}
variable "aws_secret_access_key" {
  type        = string
  description = "AWS Secret Access Key"
}
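# Example of passing the variables at runtime
# (illustrative values, not from the article):
#
#   export TF_VAR_aws_access_key_id="<AWS_ACCESS_KEY_ID>"
#   export TF_VAR_aws_secret_access_key="<AWS_SECRET_ACCESS_KEY>"
#   terraform apply \
#     -var="project_id=my-gcp-project" \
#     -var="region=europe-west1" \
#     -var="aws_s3_bucket_name=my-bucket"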
# --------------------------------------------------
# TERRAFORM CONFIGURATION
# Setting the provider and the project
# --------------------------------------------------
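# Optional: pin the provider source and version.
# Not part of the original article; the version
# constraint below is only an example.
terraform {
  required_providers {
    google = {
      source  = "hashicorp/google"
      version = ">= 4.0"
    }
  }
}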
provider "google" {
project = var.project_id
region = var.region
zone = var.zone
user_project_override = true
}
# --------------------------------------------------
# RESOURCES
# Note the comments below
# --------------------------------------------------
# Enable the required services
resource "google_project_service" "services" {
for_each = toset([
"bigquery.googleapis.com",
"bigquerydatatransfer.googleapis.com",
"bigquerystorage.googleapis.com"
])
project = var.project_id
service = each.key
disable_dependent_services = false
disable_on_destroy = false
}
# Create the BigQuery dataset
resource "google_bigquery_dataset" "import_dataset" {
dataset_id = "import_data"
location = var.region
depends_on = [google_project_service.services]
}
# Create the destination table
resource "google_bigquery_table" "import_transactions" {
dataset_id = google_bigquery_dataset.import_dataset.dataset_id
table_id = "import_transactions"
# Enable table partitioning per-day
time_partitioning {
type = "DAY"
}
schema = jsonencode([
{
"name" = "customer_id",
"type" = "INT64",
"mode" = "STRING",
"description" = "Customer ID"
},
{
"name" = "card_id",
"type" = "STRING",
"mode" = "REQUIRED",
"description" = "Card ID"
},
{
"name" = "amount",
"type" = "FLOAT64",
"mode" = "REQUIRED",
"description" = "Paid amount"
}
])
depends_on = [google_bigquery_dataset.import_dataset]
}
# Create the scheduled data transfer
resource "google_bigquery_data_transfer_config" "import_connections" {
project = var.project_id
display_name = "import-transactions"
location = var.region
data_source_id = "amazon_s3"
schedule = "every day 00:30"
destination_dataset_id = google_bigquery_dataset.import_dataset.dataset_id
params = {
destination_table_name_template = google_bigquery_table.import_transactions.table_id
# Note the complex path with wildcards and variables
data_path = "s3://${var.aws_s3_bucket_name}/Transactions/{run_time-24h|\"%Y-%m-%d\"} */CreditCard-*.csv"
access_key_id = var.aws_access_key_id
file_format = "CSV"
max_bad_records = 0
ignore_unknown_values = false
field_delimiter = ","
skip_leading_rows = 0
allow_quoted_newlines = false
allow_jagged_rows = false
}
sensitive_params {
secret_access_key = var.aws_secret_access_key
}
email_preferences {
enable_failure_email = true
}
}
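# --------------------------------------------------
# OUTPUTS
# Optional example, not part of the original article:
# expose the resource name of the transfer config
# --------------------------------------------------
output "transfer_config_name" {
  description = "Resource name of the BigQuery Data Transfer config"
  value       = google_bigquery_data_transfer_config.import_connections.name
}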