Last active
September 29, 2018 03:54
-
-
Save walteryu/9ab3a6e003699c9e331ec4a044b408bb to your computer and use it in GitHub Desktop.
hw3_p2.py
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
'''
Author: Walter Yu
Course: CSCI E-63, Fall 2018
Assignment: HW3, Problem 2
References:
Slides 46 & 47, Lecture 3 Notes
SparkContext Tutorial: https://www.tutorialspoint.com/pyspark/pyspark_sparkcontext.htm
'''
from pyspark import SparkContext, SparkConf | |
from pyspark.sql import SQLContext, Row, SparkSession | |
def people_parquet():
    """Load people.json, persist name/age as Parquet, and display the result.

    Runs inside its own SparkContext ("FirstSession"):
      1. reads "people.json" into a DataFrame and prints its row count
         and contents;
      2. writes the "name" and "age" columns to "nameage.parquet";
      3. reads that Parquet output back, registers it as the temporary
         view "parqf", and prints the result of ``select * from parqf``.

    The context is always stopped on exit, even when a step fails, so a
    later session can create a fresh SparkContext.
    """
    sc = SparkContext(appName="FirstSession")
    try:
        sqlContext = SQLContext(sc)

        # Read in json file as dataframe:
        df = sqlContext.read.load("people.json", format="json")

        # Show contents of dataframe:
        print("First Session - Dataframe count:")
        print(df.count())
        print("")
        print("First Session - Contents of dataframe:")
        df.show()
        print("")

        # Query for name/age and save to parquet file.
        # mode="overwrite" keeps the script re-runnable: the default save
        # mode raises an error when "nameage.parquet" already exists.
        df.select("name", "age").write.save(
            "nameage.parquet", format="parquet", mode="overwrite"
        )
        parquetFile = sqlContext.read.parquet("nameage.parquet")

        # Save as temporary view (registerTempTable is deprecated since
        # Spark 2.0 in favour of createOrReplaceTempView):
        parquetFile.createOrReplaceTempView("parqf")
        sp = sqlContext.sql("select * from parqf")

        # Show contents of temporary table:
        print("First Session - Contents of names table:")
        sp.show()
    finally:
        # Stop the session so the Parquet file can be re-read from a
        # brand-new context afterwards.
        sc.stop()
def view_parquet():
    """Re-open nameage.parquet in a fresh Spark session and display it.

    Runs inside its own SparkContext ("SecondSession"): reads
    "nameage.parquet", registers it as the temporary view "parqf", then
    prints the row count and contents of ``select * from parqf``.

    Expects "nameage.parquet" to already exist in the working directory.
    The context is always stopped on exit, even when a step fails.
    """
    sc = SparkContext(appName="SecondSession")
    try:
        sqlContext = SQLContext(sc)

        # Load the previously saved parquet file and expose it to SQL
        # (registerTempTable is deprecated since Spark 2.0 in favour of
        # createOrReplaceTempView):
        parquetFile = sqlContext.read.parquet("nameage.parquet")
        parquetFile.createOrReplaceTempView("parqf")
        sp = sqlContext.sql("select * from parqf")

        # Show row count of table (label previously said "First Session",
        # a copy-paste slip that mislabelled this session's output):
        print("Second Session - Dataframe count:")
        print(sp.count())
        print("")

        # Show contents of temporary table:
        print("Second Session - Contents of names table:")
        sp.show()
    finally:
        # Stop session
        sc.stop()
if __name__ == "__main__":
    # Order matters: the first session writes nameage.parquet, the second
    # session reads it back from a separate SparkContext.
    people_parquet()
    view_parquet()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment