drj42 · August 29, 2015 14:22
diff --git a/gistfile1.py b/gistfile1.py
 """
 Launch pyspark with the flag:

 --packages com.databricks:spark-csv_2.10:1.0.3
 """ 
 from pyspark.sql import SQLContext
 from pyspark.sql.types import StructField, StringType, StructType

 def create_schema(fields):
    """Build a StructType describing one string column per field name.

    `fields` is any iterable of column names. Each name is passed through
    str() and wrapped in a StructField of StringType whose third positional
    argument is True; the fields keep the order in which they were given.
    Returns the StructType built from that list.
    """
    columns = []
    for name in fields:
        columns.append(StructField(str(name), StringType(), True))
    return StructType(columns)

 def load_csv(sq, path, delimiter=",", fields=None):
    """Load a csv file into a dataframe through the spark-csv source.

    sq        -- object whose load(**kwargs) performs the read (the usage
                 examples in this file pass a SQLContext).
    path      -- location of the csv file, forwarded unchanged.
    delimiter -- column separator, forwarded unchanged.
    fields    -- optional list of column names. When it is truthy, a schema
                 built by create_schema() is passed along and the file is
                 read as having no header row; otherwise the first row is
                 treated as the header.

    Returns whatever sq.load() returns.
    """
    use_own_schema = bool(fields)
    # Key order matches the keyword order handed to sq.load().
    options = dict(
        source='com.databricks.spark.csv',
        path=path,
        header="false" if use_own_schema else 'true',
        delimiter=delimiter,
    )
    if use_own_schema:
        options['schema'] = create_schema(fields)
    return sq.load(**options)


 # Example usage. Neither `sqlContext` nor `path` is defined in this file:
 # `sqlContext` is presumably the one provided by the pyspark shell (TODO
 # confirm), and `path` must be set to the csv location before running.
 # Note that the second call rebinds `csv_dataframe`.

 # load a file with a header included
 csv_dataframe = load_csv(sqlContext, path)

 # load a file without a header, but apply your own schema
 csv_dataframe = load_csv(sqlContext, path, fields=['name','age'])
	"""
	Launch pyspark with the flag:

	--packages com.databricks:spark-csv_2.10:1.0.3
	"""
	from pyspark.sql import SQLContext
	from pyspark.sql.types import StructField, StringType, StructType

	def create_schema(fields):
		"""Create a dataframe schema from a list of field names.

		Every name in `fields` is converted with str() and becomes a
		StructField of StringType, constructed with True as its third
		positional argument. Field order follows the input order.

		Returns the StructType wrapping those fields.
		"""
		# The body must be indented under the def; the flattened paste was
		# rejected by Python with an IndentationError.
		schema = [StructField(str(field), StringType(), True) for field in fields]
		return StructType(schema)

	def load_csv(sq, path, delimiter=",", fields=None):
		"""Create a dataframe from a csv file.

		If no fields are specified, the file is assumed to include a header
		row. If `fields` is a non-empty list of names, a schema built with
		create_schema() is applied and the file is read as headerless.

		sq        -- object whose load(**kwargs) performs the read (the usage
		             examples in this file pass a SQLContext).
		path      -- location of the csv file.
		delimiter -- column separator, "," by default.
		fields    -- optional list of column names.

		Returns whatever sq.load() returns.
		"""
		args = {
			'source': 'com.databricks.spark.csv',
			'path': path,
			'header': 'true',
			'delimiter': delimiter,
		}
		# Only the explicit-schema case overrides the header option; the two
		# assignments belong inside the `if`, which the flattened paste lost.
		if fields:
			args['schema'] = create_schema(fields)
			args['header'] = "false"
		return sq.load(**args)


	# Example usage. Neither `sqlContext` nor `path` is defined in this file:
	# `sqlContext` is presumably the one provided by the pyspark shell (TODO
	# confirm), and `path` must be set to the csv location before running.
	# Note that the second call rebinds `csv_dataframe`.

	# load a file with a header included
	csv_dataframe = load_csv(sqlContext, path)

	# load a file without a header, but apply your own schema
	csv_dataframe = load_csv(sqlContext, path, fields=['name','age'])