Created
December 5, 2017 17:37
-
-
Save mrandrewandrade/78fed9f02119d5b55a50ca55a66f6297 to your computer and use it in GitHub Desktop.
Code examples from "Spark: The Definitive Guide"
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Base directory of the book's sample data; point this at your local clone.
dataset_path = "/path/to/Spark-The-Definitive-Guide/data/"

# A single-column DataFrame holding the numbers 0..999.
myRange = spark.range(1000).toDF("number")

# Lazily filter down to the even numbers, then count them (triggers a job).
divisBy2 = myRange.where("number % 2 = 0")
divisBy2.count()
# Load the 2015 flight summary CSV, treating the first row as a header and
# letting Spark infer the column types.
flightData2015 = (
    spark.read
    .option("header", "true")
    .option("inferSchema", "true")
    .csv(dataset_path + "flight-data/csv/2015-summary.csv")
)

# Peek at the first three rows.
flightData2015.take(3)

# Sorting is a wide transformation; explain() prints the physical plan.
flightData2015.sort("count").explain()
flightData2015.sort("count").take(2)

# Register the DataFrame as a SQL view so the same data can be queried two ways.
flightData2015.createOrReplaceTempView("flight_data_2015")

# Group-by count expressed in SQL...
sqlWay = spark.sql( """ SELECT DEST_COUNTRY_NAME, count(1) FROM flight_data_2015 GROUP BY DEST_COUNTRY_NAME """)
sqlWay.explain()

# ...and the equivalent DataFrame-API expression; both compile to the same plan.
dataFrameWay = flightData2015.groupBy("DEST_COUNTRY_NAME").count()
dataFrameWay.explain()
# Import under an alias so the builtin max() is not shadowed at module scope
# (the original `from pyspark.sql.functions import max` clobbered it).
from pyspark.sql.functions import max as sf_max

# Single-row result: the largest value in the "count" column.
flightData2015.select(sf_max("count")).take(1)

# Top five destinations by total inbound flights — SQL flavour.
maxSql = spark.sql("""SELECT DEST_COUNTRY_NAME, sum(count) as destination_total FROM flight_data_2015 GROUP BY DEST_COUNTRY_NAME ORDER BY sum(count) DESC LIMIT 5 """)
maxSql.collect()

# The same top-five query expressed with the DataFrame API.
from pyspark.sql.functions import desc
flightData2015\
    .groupBy("DEST_COUNTRY_NAME")\
    .sum("count")\
    .withColumnRenamed("sum(count)", "destination_total")\
    .sort(desc("destination_total"))\
    .limit(5)\
    .collect()
# Re-read the same 2015 summary file via the generic format/load reader.
DF1 = (
    spark.read.format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load(dataset_path + "flight-data/csv/2015-summary.csv")
)

# NOTE: collect() materializes each result on the driver, so DF2-DF4 are
# lists of Row objects rather than DataFrames, despite the names.
DF2 = DF1.groupBy("DEST_COUNTRY_NAME").count().collect()
DF3 = DF1.groupBy("ORIGIN_COUNTRY_NAME").count().collect()
DF4 = DF1.groupBy("count").count().collect()
# BUG FIX: the original read load("dataset_path + "retail-data/by-day/*.csv"),
# a stray quote that turned the variable name into a string literal and broke
# the syntax. Concatenate the path variable properly instead.
staticDataFrame = (
    spark.read.format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load(dataset_path + "retail-data/by-day/*.csv")
)
staticDataFrame.createOrReplaceTempView("retail_data")

# Keep the inferred schema around (the book reuses it for the streaming read).
staticSchema = staticDataFrame.schema

from pyspark.sql.functions import window, column, desc, col

# Total spend per customer per 1-day event-time window over InvoiceDate,
# largest window totals first; take(5) pulls the top rows to the driver.
staticDataFrame\
    .selectExpr("CustomerId", "(UnitPrice * Quantity) as total_cost", "InvoiceDate")\
    .groupBy(col("CustomerId"), window(col("InvoiceDate"), "1 day"))\
    .sum("total_cost")\
    .orderBy(desc("sum(total_cost)"))\
    .take(5)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment