Created
September 6, 2017 10:46
-
-
Save tilakpatidar/b5ce75fe72e121c31a54a5d15d0801d4 to your computer and use it in GitHub Desktop.
dummy
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Spark-shell script: pivot store-sales line items into a wide T/F matrix with
// one row per sale (receipt + register + time + date) and one column per article_id,
// where "T" marks articles present in that sale.
// NOTE(review): assumes a live `spark` SparkSession (spark-shell) and a local CSV — confirm paths before reuse.
import org.apache.spark.sql.functions.{sum, udf} // `sum` was used below but never imported; spark-shell auto-imports masked this
import spark.sessionState.conf

// Raise the pivot-column cap so pivot("article_id") can accept every distinct article.
conf.setConfString("spark.sql.pivotMaxValues", Int.MaxValue.toString)

val csv = spark.read
  .format("csv")
  .option("header", true)
  .load("/Users/tilak/Downloads/Pam/SalesAnalysis/data/store_sales_unified_2017.csv")

// Composite key identifying a single sale transaction.
val uniqueKey: (String, String, String, String) => String =
  (receipt, register, time, date) => s"${receipt}_${register}_${time}_${date}"
val someFn = udf(uniqueKey)

val newData = csv.withColumn(
  "unique",
  someFn(csv.col("receipt_id"), csv.col("cash_register_id"), csv.col("sale_time"), csv.col("date"))
)

// Occurrences of each article within each sale.
val countArticles = newData.groupBy("unique", "article_id").count()

// Collect the distinct article ids up front: passing explicit pivot values
// avoids a second scan of the data when pivoting.
val articles = countArticles.select("article_id").distinct() // was `var` but never reassigned
val articleIds = articles.collect.map(x => x(0))

// "T" when the article occurs at least once in the sale; absent cells become "F" via na.fill.
val putBoolean: (Long) => String = (x) => if (x > 0) "T" else "F"
val someFn3 = udf(putBoolean)

val pivoted = countArticles
  .groupBy("unique")
  .pivot("article_id", articleIds)
  .agg(someFn3(sum("count")))
  .na.fill("F")

// coalesce(1) forces a single part file; "final235.csv" is a directory, not a file.
pivoted.coalesce(1).write.option("header", "true").csv("final235.csv")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment