Last active
April 13, 2022 16:11
-
-
Save edonosotti/1fab01f479ae4f76d145972294d9a9b1 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# ==================================================
# Create a BigQuery Data Transfer job with Terraform
# ==================================================
# This code was written for this article:
# https://medium.com/@edonosotti/importing-big-data-in-bigquery-with-data-transfer-service-and-terraform-dc55fdfdc821
#
# CSV file template: <<EOF
# customer_id,card_id,amount
# 1,0001,10.00
# 2,0002,2.00
# 3,0003,8.00
# EOF
# --------------------------------------------------
# VARIABLES
# Set these before applying the configuration
# Do not store sensitive values in a configuration
# file, pass them at runtime via environment
# variables or with `-var`
# --------------------------------------------------
# Google Cloud Platform project all resources are created in.
variable "project_id" {
  type        = string
  description = "Google Cloud Platform Project ID"
}

# Region used for the dataset and the transfer configuration.
variable "region" {
  type        = string
  description = "Google Cloud Platform region"
}

# Source bucket holding the CSV files to import.
variable "aws_s3_bucket_name" {
  type        = string
  description = "Name of the AWS S3 bucket"
}

# AWS credentials used by the Data Transfer Service to read the
# bucket. Marked `sensitive` so Terraform redacts the values from
# plan/apply output (requires Terraform >= 0.14).
variable "aws_access_key_id" {
  type        = string
  description = "AWS Access Key ID"
  sensitive   = true
}

variable "aws_secret_access_key" {
  type        = string
  description = "AWS Secret Access Key"
  sensitive   = true
}
# --------------------------------------------------
# TERRAFORM CONFIGURATION
# Setting the provider and the project
# --------------------------------------------------
# Default google provider, pinned to the target project and region.
# BUG FIX: the original set `zone = var.zone`, but no "zone" variable
# is declared anywhere in this file, so `terraform validate` fails on
# an undeclared variable. None of the resources below are zonal, so
# the attribute is removed rather than declaring an unused variable.
provider "google" {
  project               = var.project_id
  region                = var.region
  user_project_override = true
}
# --------------------------------------------------
# RESOURCES
# Note the comments below
# --------------------------------------------------
# Make sure the BigQuery APIs required by the resources below are
# enabled on the project before anything else is created.
resource "google_project_service" "services" {
  for_each = toset([
    "bigquery.googleapis.com",
    "bigquerydatatransfer.googleapis.com",
    "bigquerystorage.googleapis.com"
  ])

  project = var.project_id
  service = each.value

  # Leave the APIs (and anything that depends on them) untouched
  # when this configuration is destroyed.
  disable_on_destroy         = false
  disable_dependent_services = false
}
# Dataset that receives the imported transaction data.
resource "google_bigquery_dataset" "import_dataset" {
  dataset_id = "import_data"
  location   = var.region

  # Explicit dependency: the BigQuery APIs must be enabled first.
  depends_on = [google_project_service.services]
}
# Destination table for the imported CSV rows.
resource "google_bigquery_table" "import_transactions" {
  dataset_id = google_bigquery_dataset.import_dataset.dataset_id
  table_id   = "import_transactions"

  # Partition the table per ingestion day.
  time_partitioning {
    type = "DAY"
  }

  schema = jsonencode([
    {
      "name" = "customer_id",
      "type" = "INT64",
      # BUG FIX: the original had `"mode" = "STRING"`, but "STRING"
      # is a column *type*, not a mode — valid modes are NULLABLE,
      # REQUIRED and REPEATED, so the table creation would be
      # rejected. The other columns use REQUIRED; this one now does too.
      "mode"        = "REQUIRED",
      "description" = "Customer ID"
    },
    {
      "name"        = "card_id",
      "type"        = "STRING",
      "mode"        = "REQUIRED",
      "description" = "Card ID"
    },
    {
      "name"        = "amount",
      "type"        = "FLOAT64",
      "mode"        = "REQUIRED",
      "description" = "Paid amount"
    }
  ])

  depends_on = [google_bigquery_dataset.import_dataset]
}
# Scheduled Data Transfer Service job that pulls the CSV files from
# the S3 bucket into the destination table every night at 00:30.
resource "google_bigquery_data_transfer_config" "import_connections" {
  project                = var.project_id
  display_name           = "import-transactions"
  location               = var.region
  data_source_id         = "amazon_s3"
  schedule               = "every day 00:30"
  destination_dataset_id = google_bigquery_dataset.import_dataset.dataset_id

  params = {
    destination_table_name_template = google_bigquery_table.import_transactions.table_id
    # Matches files dated one day behind the run time, e.g.
    #   s3://<bucket>/Transactions/2022-01-01 */CreditCard-xyz.csv
    # NOTE(review): the space before "*/" looks suspicious — if the
    # date-named prefixes have no trailing space in the bucket, this
    # should read `...%Y-%m-%d\"}*/CreditCard-*.csv`. Confirm against
    # the actual bucket layout before changing it.
    data_path     = "s3://${var.aws_s3_bucket_name}/Transactions/{run_time-24h|\"%Y-%m-%d\"} */CreditCard-*.csv"
    access_key_id = var.aws_access_key_id

    file_format = "CSV"
    # BUG FIX: the CSV template documented at the top of this file has
    # a header row (customer_id,card_id,amount). With the original
    # `skip_leading_rows = 0` the header would be ingested as data and
    # fail the INT64/FLOAT64 conversions, so one leading row is skipped.
    skip_leading_rows     = 1
    max_bad_records       = 0
    ignore_unknown_values = false
    field_delimiter       = ","
    allow_quoted_newlines = false
    allow_jagged_rows     = false
  }

  # The secret key goes in sensitive_params so it is not exposed in
  # plan/apply output.
  sensitive_params {
    secret_access_key = var.aws_secret_access_key
  }

  email_preferences {
    enable_failure_email = true
  }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment