# sc is an existing SparkContext.
from pyspark.sql import SQLContext, Row
sqlContext = SQLContext(sc)

# Load a text file and convert each line to a Row.
lines = sc.textFile("examples/src/main/resources/people.txt")
parts = lines.map(lambda l: l.split(","))
people = parts.map(lambda p: Row(name=p[0], age=int(p[1])))
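
# (Optional sanity check, illustrative only: peek at the first parsed record.
# Row fields are accessible by name, which is what the schema inference below
# relies on. With the stock people.txt this prints something like
# Row(age=29, name=u'Michael').)
print people.first()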

# If the data lives in a local CSV file instead, parse it with the csv module
# (https://docs.python.org/2/library/csv.html) and turn the rows into an RDD.
import csv
with open('people.csv', 'rb') as csvfile:
    # delimiter and quotechar follow the csv docs example; adjust them to match the real file
    allpeople = csv.reader(csvfile, delimiter=' ', quotechar='|')
    rows = [Row(name=p[0], age=int(p[1])) for p in allpeople]
people = sc.parallelize(rows)
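
# (Alternative sketch, assuming people.csv is too large to parse on the driver:
# read it with sc.textFile and run csv.reader per line so parsing stays on the
# workers. The parse_csv_line helper and csv_people name are illustrative;
# csv_people could then replace `people` before the schema inference below.)
def parse_csv_line(line):
    import csv
    # csv.reader copes with quoting that a bare split(',') would break on.
    return next(csv.reader([line], delimiter=' ', quotechar='|'))

csv_lines = sc.textFile("people.csv")
csv_people = csv_lines.map(parse_csv_line).map(lambda p: Row(name=p[0], age=int(p[1])))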

# Infer the schema, and register the SchemaRDD as a table.
schemaPeople = sqlContext.inferSchema(people)
schemaPeople.registerTempTable("people")

# SQL can be run over SchemaRDDs that have been registered as a table.
teenagers = sqlContext.sql("SELECT name FROM people WHERE age >= 13 AND age <= 19")

# The results of SQL queries are RDDs and support all the normal RDD operations.
teenNames = teenagers.map(lambda p: "Name: " + p.name)
for teenName in teenNames.collect():
    print teenName
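
# With the stock examples/src/main/resources/people.txt that ships with Spark
# (Michael, 29 / Andy, 30 / Justin, 19), the loop above prints a single line:
# Name: Justin

# (Sketch of the same step on Spark 1.3+, where SchemaRDD became DataFrame and
# inferSchema was superseded by createDataFrame; registerTempTable still works
# there. Left commented out because the code above targets the Spark 1.x API.)
# schemaPeople = sqlContext.createDataFrame(people)
# schemaPeople.registerTempTable("people")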