Jamie Thomson jamiekt

An example of how to count distinct values in a column using pyspark

	select count(distinct basket) as number_of_baskets_purchased
	, count(distinct product) as number_of_products_purchased
	, count(distinct customer) as number_of_customers_purchasing
	, count(distinct store) as number_of_stores_purchased_in
	from data

	from pyspark.sql.functions import col, lit
	# df is a Spark DataFrame: DataFrame[basket: string, product: string, customer: string, store: string]
	baskets_tally = df.groupBy().agg(countDistinct(col('basket'))).collect()[0][0]
	df = df.groupBy(col('basket')).count().withColumnRenamed('count', 'tally_of_products_per_basket')
	df = df.groupBy("tally_of_products_per_basket") \
	.count() \
	.withColumnRenamed('count', 'tally_of_baskets_containing_products_tally') \
	.orderBy(col("tally_of_products_per_basket").asc())
	df = df.withColumn(
	'fraction_of_baskets_containing_products_tally',

	FROM ubuntu:18.04
	RUN apt-get update
	RUN apt-get upgrade -y
	RUN apt-get install python python-pip
	RUN pip install sklearn

	resource "google_bigquery_dataset" "dataset1"{
	dataset_id = "dataset1"
	}

	resource "google_bigquery_table" "view1"{
	dataset_id = "${google_bigquery_dataset.dataset1.dataset_id}"
	table_id = "view1"

	view {
	query = "select 1 as col union all select 2 union all select 3"

	resource "google_bigquery_table" "view2"{
	dataset_id = "${google_bigquery_dataset.dataset1.dataset_id}"
	table_id = "view2"

	view {
	query = "select * from `${var.project}.dataset1.view1`"
	use_legacy_sql = false
	depends_on = ["google_bigquery_table.view1"]
	}
	}

	gcloud init #choose the project that you will be deploying to
	gcloud services enable dataproc.googleapis.com sqladmin.googleapis.com
	export GCP_PROJECT=$(gcloud config get-value project)
	gsutil mb gs://${GCP_PROJECT}-tf-state #terraform state bucket used as the back-end for the Google provider
	make init apply

	callBQStoredProc:
	params: [stored_proc_name, project_id, poll_wait_time_seconds: 5, bq_location: EU]
	steps:
	- init:
	assign:
	- job:
	jobReference:
	location: $${bq_location}
	configuration:
	query: