korkridake · November 23, 2018 07:36
diff --git a/PySpark_Tutorial_SummaryStatistics.py b/PySpark_Tutorial_SummaryStatistics.py
 spark
 # <pyspark.sql.session.SparkSession at 0x7f8df8673ba8>

 # -------------------------------------------------------------------------------
 # Import PySpark Libraries
 # -------------------------------------------------------------------------------
 from pyspark.sql.functions import skewness, kurtosis
 from pyspark.sql.functions import var_pop, var_samp, stddev, stddev_pop, sumDistinct, ntile
 from pyspark.sql.types import IntegerType
 from pyspark.sql.types import StringType
 from pyspark.sql import Row

 # -------------------------------------------------------------------------------
 # Load Data and Declare Variable Type
 # Resources
 # - https://stackoverflow.com/questions/46956026/how-to-convert-column-with-string-type-to-int-form-in-pyspark-data-frame
 # -------------------------------------------------------------------------------
 # Read the iris CSV from DBFS: the first line is the header, fields are
 # comma-separated, and Spark infers each column's type from the data.
 csv_options = {'header': True, 'inferSchema': True, 'sep': ','}
 df = spark.read.csv('dbfs:/mnt/devilsci_mount/temp/iris_data.csv', **csv_options)
 # Store the class label column as a string.
 target_as_string = df["target"].cast(StringType())
 df = df.withColumn("target", target_as_string)
 # Preview the first 10 rows.
 df.show(10)
 # +-----------------+----------------+-----------------+----------------+------+
 # |sepal length (cm)|sepal width (cm)|petal length (cm)|petal width (cm)|target|
 # +-----------------+----------------+-----------------+----------------+------+
 # |              5.1|             3.5|              1.4|             0.2|   0.0|
 # |              4.9|             3.0|              1.4|             0.2|   0.0|
 # |              4.7|             3.2|              1.3|             0.2|   0.0|
 # |              4.6|             3.1|              1.5|             0.2|   0.0|
 # |              5.0|             3.6|              1.4|             0.2|   0.0|
 # |              5.4|             3.9|              1.7|             0.4|   0.0|
 # |              4.6|             3.4|              1.4|             0.3|   0.0|
 # |              5.0|             3.4|              1.5|             0.2|   0.0|
 # |              4.4|             2.9|              1.4|             0.2|   0.0|
 # |              4.9|             3.1|              1.5|             0.1|   0.0|
 # +-----------------+----------------+-----------------+----------------+------+

 # "Glimpse" of the DataFrame in PySpark: print each column's name, type and nullability
 df.printSchema()
 # root
 # |-- sepal length (cm): double (nullable = true)
 # |-- sepal width (cm): double (nullable = true)
 # |-- petal length (cm): double (nullable = true)
 # |-- petal width (cm): double (nullable = true)
 # |-- target: string (nullable = true)

 # describe() returns a DataFrame of summary statistics (count, mean, stddev,
 # min, max): a leading 'summary' column followed by one column per column of df.
 df_described = df.describe()
 df_described.show()
 # +-------+------------------+-------------------+------------------+------------------+------------------+
 #   |summary| sepal length (cm)|   sepal width (cm)| petal length (cm)|  petal width (cm)|            target|
 #   +-------+------------------+-------------------+------------------+------------------+------------------+
 #   |  count|               150|                150|               150|               150|               150|
 #   |   mean| 5.843333333333335| 3.0540000000000007|3.7586666666666693|1.1986666666666672|               1.0|
 #   | stddev|0.8280661279778637|0.43359431136217375| 1.764420419952262|0.7631607417008414|0.8192319205190406|
 #   |    min|               4.3|                2.0|               1.0|               0.1|               0.0|
 #   |    max|               7.9|                4.4|               6.9|               2.5|               2.0|
 #   +-------+------------------+-------------------+------------------+------------------+------------------+

 # Skewness of a single column, computed with the imported aggregate function
 df.select(skewness('sepal length (cm)')).show()
 # +---------------------------+
 #   |skewness(sepal length (cm))|
 #   +---------------------------+
 #   |        0.31175305850229657|
 #   +---------------------------+


 # Column names of the describe() output:
 # ['summary', 'sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)', 'target']
 columns = df_described.columns
 funcs   = [skewness, kurtosis]  # aggregate functions to add to the summary (imported earlier)
 fnames  = ['skew', 'kurtosis']  # row labels for those functions, in the same order as funcs

 def new_item(func, column):
     """
     Apply the aggregation ``func`` to ``column`` of the module-level ``df``.

     The single aggregated value is collected and returned as a string,
     because describe() also reports its statistics as strings.
     """
     aggregated = df.select(func(column))
     first_row = aggregated.collect()[0]
     return str(first_row[0])

 new_data = []
 for func, fname in zip(funcs, fnames):
     # Every row starts with its label under "summary", followed by one entry
     # per data column (columns[0] is the "summary" column itself, so skip it).
     row_dict = {'summary': fname}
     row_dict.update((column, new_item(func, column)) for column in columns[1:])
     # Row(**mapping) passes each key/value pair as a keyword argument.
     new_data.append(Row(**row_dict))

 print(new_data)
 # [Row(petal length (cm)='-0.27171195017163935', petal width (cm)='-0.1039436662675169', sepal length (cm)='0.31175305850229657', sepal width (cm)='0.3307028127733166', summary='skew', target='-4.079219866531552e-17'), Row(petal length (cm)='-1.3953593021397108', petal width (cm)='-1.3352456441311868', sepal length (cm)='-0.5735679489249783', sepal width (cm)='0.24144329938317943', summary='kurtosis', target='-1.5000000000000004')]

 # Collect the describe() rows as a list of Row objects (all values are strings)
 df_described.collect()
 # [Row(summary='count', sepal length (cm)='150', sepal width (cm)='150', petal length (cm)='150', petal width (cm)='150', target='150'),
 #  Row(summary='mean', sepal length (cm)='5.843333333333335', sepal width (cm)='3.0540000000000007', petal length (cm)='3.7586666666666693', petal width (cm)='1.1986666666666672', target='1.0'),
 #  Row(summary='stddev', sepal length (cm)='0.8280661279778637', sepal width (cm)='0.43359431136217375', petal length (cm)='1.764420419952262', petal width (cm)='0.7631607417008414', target='0.8192319205190406'),
 #  Row(summary='min', sepal length (cm)='4.3', sepal width (cm)='2.0', petal length (cm)='1.0', petal width (cm)='0.1', target='0.0'),
 #  Row(summary='max', sepal length (cm)='7.9', sepal width (cm)='4.4', petal length (cm)='6.9', petal width (cm)='2.5', target='2.0')]

 # Turn the extra statistic rows into a DataFrame and put its columns in the
 # same order as the describe() output before the two are stacked.
 new_describe = (
     sc.parallelize(new_data)
     .toDF()
     .select(df_described.columns)
 )

 # Append the new statistics below the original describe() rows and display them.
 expanded_describe = df_described.unionAll(new_describe)
 expanded_describe.show()
 # +--------+-------------------+-------------------+--------------------+-------------------+--------------------+
 #   | summary|  sepal length (cm)|   sepal width (cm)|   petal length (cm)|   petal width (cm)|              target|
 #   +--------+-------------------+-------------------+--------------------+-------------------+--------------------+
 #   |   count|                150|                150|                 150|                150|                 150|
 #   |    mean|  5.843333333333335| 3.0540000000000007|  3.7586666666666693| 1.1986666666666672|                 1.0|
 #   |  stddev| 0.8280661279778637|0.43359431136217375|   1.764420419952262| 0.7631607417008414|  0.8192319205190406|
 #   |     min|                4.3|                2.0|                 1.0|                0.1|                 0.0|
 #   |     max|                7.9|                4.4|                 6.9|                2.5|                 2.0|
 #   |    skew|0.31175305850229657| 0.3307028127733166|-0.27171195017163935|-0.1039436662675169|-4.07921986653155...|
 #   |kurtosis|-0.5735679489249783|0.24144329938317943| -1.3953593021397108|-1.3352456441311868| -1.5000000000000004|
 #   +--------+-------------------+-------------------+--------------------+-------------------+--------------------+
	spark
	# <pyspark.sql.session.SparkSession at 0x7f8df8673ba8>

	# -------------------------------------------------------------------------------
	# Import PySpark Libraries
	# -------------------------------------------------------------------------------
	from pyspark.sql.functions import skewness, kurtosis
	from pyspark.sql.functions import var_pop, var_samp, stddev, stddev_pop, sumDistinct, ntile
	from pyspark.sql.types import IntegerType
	from pyspark.sql.types import StringType
	from pyspark.sql import Row

	# -------------------------------------------------------------------------------
	# Load Data and Declare Variable Type
	# Resources
	# - https://stackoverflow.com/questions/46956026/how-to-convert-column-with-string-type-to-int-form-in-pyspark-data-frame
	# -------------------------------------------------------------------------------
	# Load the iris CSV (header row, comma-separated, column types inferred by Spark).
	iris_reader = spark.read
	df = iris_reader.csv(
	    'dbfs:/mnt/devilsci_mount/temp/iris_data.csv',
	    header=True,
	    inferSchema=True,
	    sep=',',
	)
	# Store the class label column as a string.
	target_as_string = df["target"].cast(StringType())
	df = df.withColumn("target", target_as_string)
	# Preview the first 10 rows.
	df.show(10)
	# +-----------------+----------------+-----------------+----------------+------+
	# |sepal length (cm)|sepal width (cm)|petal length (cm)|petal width (cm)|target|
	# +-----------------+----------------+-----------------+----------------+------+
	# |              5.1|             3.5|              1.4|             0.2|   0.0|
	# |              4.9|             3.0|              1.4|             0.2|   0.0|
	# |              4.7|             3.2|              1.3|             0.2|   0.0|
	# |              4.6|             3.1|              1.5|             0.2|   0.0|
	# |              5.0|             3.6|              1.4|             0.2|   0.0|
	# |              5.4|             3.9|              1.7|             0.4|   0.0|
	# |              4.6|             3.4|              1.4|             0.3|   0.0|
	# |              5.0|             3.4|              1.5|             0.2|   0.0|
	# |              4.4|             2.9|              1.4|             0.2|   0.0|
	# |              4.9|             3.1|              1.5|             0.1|   0.0|
	# +-----------------+----------------+-----------------+----------------+------+

	# "Glimpse" of the DataFrame in PySpark: print each column's name, type and nullability
	df.printSchema()
	# root
	# |-- sepal length (cm): double (nullable = true)
	# |-- sepal width (cm): double (nullable = true)
	# |-- petal length (cm): double (nullable = true)
	# |-- petal width (cm): double (nullable = true)
	# |-- target: string (nullable = true)

	# describe() returns a DataFrame of summary statistics (count, mean, stddev,
	# min, max): a leading 'summary' column followed by one column per column of df.
	df_described = df.describe()
	df_described.show()
	# +-------+------------------+-------------------+------------------+------------------+------------------+
	# |summary| sepal length (cm)|   sepal width (cm)| petal length (cm)|  petal width (cm)|            target|
	# +-------+------------------+-------------------+------------------+------------------+------------------+
	# |  count|               150|                150|               150|               150|               150|
	# |   mean| 5.843333333333335| 3.0540000000000007|3.7586666666666693|1.1986666666666672|               1.0|
	# | stddev|0.8280661279778637|0.43359431136217375| 1.764420419952262|0.7631607417008414|0.8192319205190406|
	# |    min|               4.3|                2.0|               1.0|               0.1|               0.0|
	# |    max|               7.9|                4.4|               6.9|               2.5|               2.0|
	# +-------+------------------+-------------------+------------------+------------------+------------------+

	# Skewness of a single column, computed with the imported aggregate function
	df.select(skewness('sepal length (cm)')).show()
	# +---------------------------+
	# |skewness(sepal length (cm))|
	# +---------------------------+
	# |        0.31175305850229657|
	# +---------------------------+


	# Column names of the describe() output:
	# ['summary', 'sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)', 'target']
	columns = df_described.columns
	funcs = [skewness, kurtosis] # aggregate functions to add to the summary (imported earlier)
	fnames = ['skew', 'kurtosis'] # row labels for those functions, in the same order as funcs

	def new_item(func, column):
	    """
	    Apply an aggregation function to one column of ``df`` and return the result.

	    Args:
	        func: aggregation function that accepts a column name
	            (e.g. skewness or kurtosis).
	        column: name of the column of the module-level ``df`` to aggregate.

	    Returns:
	        The aggregated value as a string. It is a string despite being a
	        number, because that matches the output of describe().
	    """
	    # select() yields a one-row, one-column DataFrame; [0][0] extracts its value.
	    return str(df.select(func(column)).collect()[0][0])

	# Build one Row per extra statistic, shaped like the rows of describe():
	# a "summary" label plus one string value for every data column.
	new_data = []
	for func, fname in zip(funcs, fnames):
	    row_dict = {'summary': fname}  # each row object begins with an entry for "summary"
	    for column in columns[1:]:  # columns[0] is "summary" itself, so skip it
	        row_dict[column] = new_item(func, column)
	    # ** unpacks the dictionary so every key becomes a named field of the Row;
	    # without it the whole dict would be passed as one positional value.
	    new_data.append(Row(**row_dict))

	print(new_data)
	# [Row(petal length (cm)='-0.27171195017163935', petal width (cm)='-0.1039436662675169', sepal length (cm)='0.31175305850229657', sepal width (cm)='0.3307028127733166', summary='skew', target='-4.079219866531552e-17'), Row(petal length (cm)='-1.3953593021397108', petal width (cm)='-1.3352456441311868', sepal length (cm)='-0.5735679489249783', sepal width (cm)='0.24144329938317943', summary='kurtosis', target='-1.5000000000000004')]

	# Collect the describe() rows as a list of Row objects (all values are strings)
	df_described.collect()
	# [Row(summary='count', sepal length (cm)='150', sepal width (cm)='150', petal length (cm)='150', petal width (cm)='150', target='150'),
	# Row(summary='mean', sepal length (cm)='5.843333333333335', sepal width (cm)='3.0540000000000007', petal length (cm)='3.7586666666666693', petal width (cm)='1.1986666666666672', target='1.0'),
	# Row(summary='stddev', sepal length (cm)='0.8280661279778637', sepal width (cm)='0.43359431136217375', petal length (cm)='1.764420419952262', petal width (cm)='0.7631607417008414', target='0.8192319205190406'),
	# Row(summary='min', sepal length (cm)='4.3', sepal width (cm)='2.0', petal length (cm)='1.0', petal width (cm)='0.1', target='0.0'),
	# Row(summary='max', sepal length (cm)='7.9', sepal width (cm)='4.4', petal length (cm)='6.9', petal width (cm)='2.5', target='2.0')]

	# Turn the extra statistic rows into a DataFrame and put its columns in the
	# same order as the describe() output before the two are stacked.
	new_describe = (
	    sc.parallelize(new_data)
	    .toDF()
	    .select(df_described.columns)
	)

	# Append the new statistics below the original describe() rows and display them.
	expanded_describe = df_described.unionAll(new_describe)
	expanded_describe.show()
	# +--------+-------------------+-------------------+--------------------+-------------------+--------------------+
	# | summary|  sepal length (cm)|   sepal width (cm)|   petal length (cm)|   petal width (cm)|              target|
	# +--------+-------------------+-------------------+--------------------+-------------------+--------------------+
	# |   count|                150|                150|                 150|                150|                 150|
	# |    mean|  5.843333333333335| 3.0540000000000007|  3.7586666666666693| 1.1986666666666672|                 1.0|
	# |  stddev| 0.8280661279778637|0.43359431136217375|   1.764420419952262| 0.7631607417008414|  0.8192319205190406|
	# |     min|                4.3|                2.0|                 1.0|                0.1|                 0.0|
	# |     max|                7.9|                4.4|                 6.9|                2.5|                 2.0|
	# |    skew|0.31175305850229657| 0.3307028127733166|-0.27171195017163935|-0.1039436662675169|-4.07921986653155...|
	# |kurtosis|-0.5735679489249783|0.24144329938317943| -1.3953593021397108|-1.3352456441311868| -1.5000000000000004|
	# +--------+-------------------+-------------------+--------------------+-------------------+--------------------+