@pablete
Created November 10, 2014 07:10
# sc is an existing SparkContext.
from pyspark.sql import SQLContext, Row
sqlContext = SQLContext(sc)
# Load a text file and convert each line to a Row.
lines = sc.textFile("examples/src/main/resources/people.txt")
parts = lines.map(lambda l: l.split(","))
people = parts.map(lambda p: Row(name=p[0], age=int(p[1])))
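# Quick sanity check (a sketch; assumes the file parsed cleanly and is small
# enough to peek at): take() pulls the first few Rows back to the driver.
print people.take(2)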
# If the input is a CSV file instead, parse it with the csv module
# (https://docs.python.org/2/library/csv.html) and parallelize the rows.
import csv
with open('people.csv', 'rb') as csvfile:
    rows = list(csv.reader(csvfile, delimiter=' ', quotechar='|'))
people = sc.parallelize(rows).map(lambda p: Row(name=p[0], age=int(p[1])))
# Infer the schema, and register the SchemaRDD as a table.
schemaPeople = sqlContext.inferSchema(people)
schemaPeople.registerTempTable("people")
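# Optionally inspect what inferSchema deduced; printSchema() on the
# SchemaRDD prints the inferred column names and types.
schemaPeople.printSchema()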
# SQL can be run over SchemaRDDs that have been registered as a table.
teenagers = sqlContext.sql("SELECT name FROM people WHERE age >= 13 AND age <= 19")
# The results of SQL queries are RDDs and support all the normal RDD operations.
teenNames = teenagers.map(lambda p: "Name: " + p.name)
for teenName in teenNames.collect():
    print teenName
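# The same filter expressed with plain RDD operations instead of SQL
# (a sketch; a SchemaRDD is an RDD of Rows, so filter/map apply directly):
teenagers2 = schemaPeople.filter(lambda p: 13 <= p.age <= 19)
for teenName in teenagers2.map(lambda p: "Name: " + p.name).collect():
    print teenName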