Created
December 5, 2017 17:37
-
-
Save mrandrewandrade/78fed9f02119d5b55a50ca55a66f6297 to your computer and use it in GitHub Desktop.
Code examples from "Spark: The Definitive Guide"
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Base directory of the book's sample data; point this at your local clone.
dataset_path = "/path/to/Spark-The-Definitive-Guide/data/"

# A single-column DataFrame holding the numbers 0..999.
myRange = spark.range(1000).toDF("number")

# Lazily filter down to the even numbers, then count them (triggers a job).
divisBy2 = myRange.where("number % 2 = 0")
divisBy2.count()
# Load the 2015 flight summary CSV, treating the first row as a header and
# letting Spark infer the column types.
flightData2015 = (
    spark.read
    .option("header", "true")
    .option("inferSchema", "true")
    .csv(dataset_path + "flight-data/csv/2015-summary.csv")
)

# Peek at the first three rows.
flightData2015.take(3)

# Sorting is a wide transformation; explain() prints the physical plan.
flightData2015.sort("count").explain()
flightData2015.sort("count").take(2)

# Register the DataFrame as a SQL view so the same data can be queried two ways.
flightData2015.createOrReplaceTempView("flight_data_2015")

# Group-by count expressed in SQL...
sqlWay = spark.sql( """ SELECT DEST_COUNTRY_NAME, count(1) FROM flight_data_2015 GROUP BY DEST_COUNTRY_NAME """)
sqlWay.explain()

# ...and the equivalent DataFrame-API expression; both compile to the same plan.
dataFrameWay = flightData2015.groupBy("DEST_COUNTRY_NAME").count()
dataFrameWay.explain()
# Import under an alias so the builtin max() is not shadowed at module scope
# (the original `from pyspark.sql.functions import max` clobbered it).
from pyspark.sql.functions import max as sf_max

# Single-row result: the largest value in the "count" column.
flightData2015.select(sf_max("count")).take(1)

# Top five destinations by total inbound flights — SQL flavour.
maxSql = spark.sql("""SELECT DEST_COUNTRY_NAME, sum(count) as destination_total FROM flight_data_2015 GROUP BY DEST_COUNTRY_NAME ORDER BY sum(count) DESC LIMIT 5 """)
maxSql.collect()

# The same top-five query expressed with the DataFrame API.
from pyspark.sql.functions import desc
flightData2015\
    .groupBy("DEST_COUNTRY_NAME")\
    .sum("count")\
    .withColumnRenamed("sum(count)", "destination_total")\
    .sort(desc("destination_total"))\
    .limit(5)\
    .collect()
# Re-read the same 2015 summary file via the generic format/load reader.
DF1 = (
    spark.read.format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load(dataset_path + "flight-data/csv/2015-summary.csv")
)

# NOTE: collect() materializes each result on the driver, so DF2-DF4 are
# lists of Row objects rather than DataFrames, despite the names.
DF2 = DF1.groupBy("DEST_COUNTRY_NAME").count().collect()
DF3 = DF1.groupBy("ORIGIN_COUNTRY_NAME").count().collect()
DF4 = DF1.groupBy("count").count().collect()
# BUG FIX: the original read load("dataset_path + "retail-data/by-day/*.csv"),
# a stray quote that turned the variable name into a string literal and broke
# the syntax. Concatenate the path variable properly instead.
staticDataFrame = (
    spark.read.format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load(dataset_path + "retail-data/by-day/*.csv")
)
staticDataFrame.createOrReplaceTempView("retail_data")

# Keep the inferred schema around (the book reuses it for the streaming read).
staticSchema = staticDataFrame.schema

from pyspark.sql.functions import window, column, desc, col

# Total spend per customer per 1-day event-time window over InvoiceDate,
# largest window totals first; take(5) pulls the top rows to the driver.
staticDataFrame\
    .selectExpr("CustomerId", "(UnitPrice * Quantity) as total_cost", "InvoiceDate")\
    .groupBy(col("CustomerId"), window(col("InvoiceDate"), "1 day"))\
    .sum("total_cost")\
    .orderBy(desc("sum(total_cost)"))\
    .take(5)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment