Skip to content

Instantly share code, notes, and snippets.

@tilakpatidar
Created September 6, 2017 10:46
Show Gist options
  • Save tilakpatidar/b5ce75fe72e121c31a54a5d15d0801d4 to your computer and use it in GitHub Desktop.
Save tilakpatidar/b5ce75fe72e121c31a54a5d15d0801d4 to your computer and use it in GitHub Desktop.
dummy
import org.apache.spark.sql.functions.udf
import org.apache.spark.sql.functions.{sum, when}
import spark.sessionState.conf
// Builds a receipt-by-article presence matrix from the sales CSV: one row per
// composite receipt key, one column per distinct article_id, each cell "T" or "F".

// Lift the pivot-column cap through the public runtime-config API instead of the
// internal SQLConf handle.
spark.conf.set("spark.sql.pivotMaxValues", Int.MaxValue.toString)

val csv = spark.read.format("csv").option("header", true).load("/Users/tilak/Downloads/Pam/SalesAnalysis/data/store_sales_unified_2017.csv")

// Joins the four identifying fields with "_". Kept as a UDF on purpose: a null field
// is rendered as the text "null" here, whereas the built-in concat would return null
// and concat_ws would skip the field, changing the resulting keys.
val uniqueKey: (String, String, String, String) => String =
  (x, y, z, v) => s"${x}_${y}_${z}_$v"
val someFn = udf(uniqueKey)
val newData = csv.withColumn("unique", someFn(csv.col("receipt_id"), csv.col("cash_register_id"), csv.col("sale_time"), csv.col("date")))

// Number of input rows per (composite key, article_id) pair.
val countArticles = newData.groupBy("unique", "article_id").count()

// Distinct article ids are collected to the driver so the pivot columns can be
// passed explicitly.
val articles = countArticles.select("article_id").distinct()
val articleIds = articles.collect().map(row => row.get(0))

// A cell is "T" when the summed count for that key/article is positive and "F"
// otherwise (including when the sum is null because the pair never occurs).
// Built-in when/otherwise keeps the expression visible to the optimizer, unlike a UDF.
val pivoted = countArticles.groupBy("unique").pivot("article_id", articleIds).agg(when(sum("count") > 0, "T").otherwise("F")).na.fill("F")

// coalesce(1) reduces the result to a single partition before it is written, with a
// header row, to the "final235.csv" output path.
pivoted.coalesce(1).write.option("header", "true").csv("final235.csv")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment