##### SparkSQL
from pyspark.sql import SparkSession
spark1 = SparkSession.builder.appName('SQL').getOrCreate()
# read in a .csv file with stock data
df = spark1.read.csv('appl_stock.csv',inferSchema=True,header=True)
# print the schema of the data we just read
df.printSchema()
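# Expected output (the column names and types match the DataFrame shown further down):
# root
#  |-- Date: timestamp (nullable = true)
#  |-- Open: double (nullable = true)
#  |-- High: double (nullable = true)
#  |-- Low: double (nullable = true)
#  |-- Close: double (nullable = true)
#  |-- Volume: integer (nullable = true)
#  |-- Adj Close: double (nullable = true)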
# Query the df
# Create a temporary view
df.createOrReplaceTempView('stock')
# Now run a simple SQL query directly on this view. It returns a DataFrame
result = spark1.sql("SELECT * FROM stock LIMIT 5")
$ result
DataFrame[Date: timestamp, Open: double, High: double, Low: double, Close: double, Volume: int, Adj Close: double]
$ result.columns
['Date', 'Open', 'High', 'Low', 'Close', 'Volume', 'Adj Close']
$ result.show()
+-------------------+----------+----------+------------------+------------------+---------+------------------+
| Date| Open| High| Low| Close| Volume| Adj Close|
+-------------------+----------+----------+------------------+------------------+---------+------------------+
|2010-01-04 00:00:00|213.429998|214.499996|212.38000099999996| 214.009998|123432400| 27.727039|
|2010-01-05 00:00:00|214.599998|215.589994| 213.249994| 214.379993|150476200|27.774976000000002|
|2010-01-06 00:00:00|214.379993| 215.23| 210.750004| 210.969995|138040000|27.333178000000004|
|2010-01-07 00:00:00| 211.75|212.000006| 209.050005| 210.58|119282800| 27.28265|
|2010-01-08 00:00:00|210.299994|212.000006|209.06000500000002|211.98000499999998|111902700| 27.464034|
+-------------------+----------+----------+------------------+------------------+---------+------------------+
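# Any Spark SQL construct works against the view; for example, an aggregation
# over the columns shown above (a sketch; the aliases yr and avg_close are our own):
$ spark1.sql("SELECT YEAR(Date) AS yr, AVG(Close) AS avg_close FROM stock GROUP BY YEAR(Date) ORDER BY yr").show()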
# How many entries in the Close field are higher than 500?
$ spark1.sql("SELECT COUNT(Close) FROM stock WHERE Close > 500").show()
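# Note that .show() only prints the result and returns None, so assigning it to a
# variable captures nothing. To keep the count as a Python value, collect the
# result instead (a sketch; the column alias 'n' is our own choice):
$ row = spark1.sql("SELECT COUNT(Close) AS n FROM stock WHERE Close > 500").collect()[0]
$ count_greater_500 = row['n']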
# Read a file (and create a DataFrame) by running spark.sql directly on the file
# Notice the csv.`<path-to-file>.csv` syntax (backticks around the path) inside the SQL query
$ df_sales = spark1.sql("SELECT * FROM csv.`Data/sales_info.csv`")
$ df_sales.show()
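# Caveat: this direct-file syntax reads with the default CSV options (no header row,
# no schema inference), so the columns come back named _c0, _c1, ... with string types.
# The resulting DataFrame can be registered and queried like any other (a sketch):
$ df_sales.createOrReplaceTempView('sales')
$ spark1.sql("SELECT _c0, _c1 FROM sales LIMIT 5").show()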