Skip to content

Instantly share code, notes, and snippets.

@korkridake
Created November 23, 2018 07:36
Show Gist options
  • Save korkridake/4adba6ccb7dc824712f9d38ad307e355 to your computer and use it in GitHub Desktop.
Save korkridake/4adba6ccb7dc824712f9d38ad307e355 to your computer and use it in GitHub Desktop.
# Evaluating the bare name echoes the active session object (sample output below).
# NOTE(review): `spark` is never created in this file -- presumably it is pre-defined
# by the notebook runtime; confirm before running this as a standalone script.
spark
# <pyspark.sql.session.SparkSession at 0x7f8df8673ba8>
# -------------------------------------------------------------------------------
# Import PySpark Libraries
# -------------------------------------------------------------------------------
from pyspark.sql.functions import skewness, kurtosis
from pyspark.sql.functions import var_pop, var_samp, stddev, stddev_pop, sumDistinct, ntile
from pyspark.sql.types import IntegerType
from pyspark.sql.types import StringType
from pyspark.sql import Row
# -------------------------------------------------------------------------------
# Load Data and Declare Variable Type
# Resources
# - https://stackoverflow.com/questions/46956026/how-to-convert-column-with-string-type-to-int-form-in-pyspark-data-frame
# -------------------------------------------------------------------------------
# Read the iris CSV: the first row holds the column names, values are
# comma-separated, and column types are inferred from the data.
df = (
    spark.read
    .options(header=True, inferSchema=True, sep=',')
    .csv('dbfs:/mnt/devilsci_mount/temp/iris_data.csv')
)
# Re-type the label column as a string.
df = df.withColumn("target", df["target"].cast(StringType()))
# Preview the first 10 rows.
df.show(10)
# +-----------------+----------------+-----------------+----------------+------+
# |sepal length (cm)|sepal width (cm)|petal length (cm)|petal width (cm)|target|
# +-----------------+----------------+-----------------+----------------+------+
# | 5.1| 3.5| 1.4| 0.2| 0.0|
# | 4.9| 3.0| 1.4| 0.2| 0.0|
# | 4.7| 3.2| 1.3| 0.2| 0.0|
# | 4.6| 3.1| 1.5| 0.2| 0.0|
# | 5.0| 3.6| 1.4| 0.2| 0.0|
# | 5.4| 3.9| 1.7| 0.4| 0.0|
# | 4.6| 3.4| 1.4| 0.3| 0.0|
# | 5.0| 3.4| 1.5| 0.2| 0.0|
# | 4.4| 2.9| 1.4| 0.2| 0.0|
# | 4.9| 3.1| 1.5| 0.1| 0.0|
# +-----------------+----------------+-----------------+----------------+------+
# Glimpse of the data in PySpark: print each column's name, type and nullability as a tree.
df.printSchema()
# root
# |-- sepal length (cm): double (nullable = true)
# |-- sepal width (cm): double (nullable = true)
# |-- petal length (cm): double (nullable = true)
# |-- petal width (cm): double (nullable = true)
# |-- target: string (nullable = true)
# Summary statistics (count, mean, stddev, min, max) for every column. describe()
# returns them as a DataFrame whose first column, "summary", names the statistic.
df_described = df.describe()
df_described.show()
# +-------+------------------+-------------------+------------------+------------------+------------------+
# |summary| sepal length (cm)| sepal width (cm)| petal length (cm)| petal width (cm)| target|
# +-------+------------------+-------------------+------------------+------------------+------------------+
# | count| 150| 150| 150| 150| 150|
# | mean| 5.843333333333335| 3.0540000000000007|3.7586666666666693|1.1986666666666672| 1.0|
# | stddev|0.8280661279778637|0.43359431136217375| 1.764420419952262|0.7631607417008414|0.8192319205190406|
# | min| 4.3| 2.0| 1.0| 0.1| 0.0|
# | max| 7.9| 4.4| 6.9| 2.5| 2.0|
# +-------+------------------+-------------------+------------------+------------------+------------------+
# Skewness of a single column, computed with the imported aggregate function.
df.select(skewness('sepal length (cm)')).show()
# +---------------------------+
# |skewness(sepal length (cm))|
# +---------------------------+
# | 0.31175305850229657|
# +---------------------------+
columns = df_described.columns #list of column names: ['summary', 'sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)', 'target']
funcs = [skewness, kurtosis] #list of aggregate functions we want to add (imported earlier)
fnames = ['skew', 'kurtosis'] #row labels for those functions, in the same order as funcs
def new_item(func, column):
    """Apply one aggregate function to one column of ``df`` and return the result as text.

    Reads the module-level DataFrame ``df``; it is not passed in as an argument.

    Args:
        func: An aggregate from ``pyspark.sql.functions`` (e.g. ``skewness``)
            that accepts a column name.
        column: Name of the column to aggregate.

    Returns:
        The aggregate value converted with ``str()``. A string is returned even
        though the value is numeric, because ``describe()`` also reports its
        statistics as strings.
    """
    # The aggregation yields a single row with a single cell; collect() runs it.
    value = df.select(func(column)).collect()[0][0]
    return str(value)
# Build one Row per extra statistic, shaped like the rows produced by describe().
# Note: new_item() collects one value per call, so this issues one Spark job per
# (statistic, column) pair.
new_data = []
for func, fname in zip(funcs, fnames):
    # Each row object begins with an entry for "summary" (the statistic's label).
    row_dict = {'summary': fname}
    # columns[0] is 'summary' itself, so only the data columns are aggregated.
    row_dict.update({column: new_item(func, column) for column in columns[1:]})
    # ** unpacks the dictionary into keyword arguments: one Row field per column.
    new_data.append(Row(**row_dict))
print(new_data)
# [Row(petal length (cm)='-0.27171195017163935', petal width (cm)='-0.1039436662675169', sepal length (cm)='0.31175305850229657', sepal width (cm)='0.3307028127733166', summary='skew', target='-4.079219866531552e-17'), Row(petal length (cm)='-1.3953593021397108', petal width (cm)='-1.3352456441311868', sepal length (cm)='-0.5735679489249783', sepal width (cm)='0.24144329938317943', summary='kurtosis', target='-1.5000000000000004')]
# Inspect the describe() rows for comparison: every statistic is stored as a string.
df_described.collect()
# [Row(summary='count', sepal length (cm)='150', sepal width (cm)='150', petal length (cm)='150', petal width (cm)='150', target='150'),
# Row(summary='mean', sepal length (cm)='5.843333333333335', sepal width (cm)='3.0540000000000007', petal length (cm)='3.7586666666666693', petal width (cm)='1.1986666666666672', target='1.0'),
# Row(summary='stddev', sepal length (cm)='0.8280661279778637', sepal width (cm)='0.43359431136217375', petal length (cm)='1.764420419952262', petal width (cm)='0.7631607417008414', target='0.8192319205190406'),
# Row(summary='min', sepal length (cm)='4.3', sepal width (cm)='2.0', petal length (cm)='1.0', petal width (cm)='0.1', target='0.0'),
# Row(summary='max', sepal length (cm)='7.9', sepal width (cm)='4.4', petal length (cm)='6.9', petal width (cm)='2.5', target='2.0')]
# Turn the extra-statistic rows into a DataFrame. createDataFrame accepts the
# list of Rows directly, so no separate SparkContext handle or RDD is needed.
new_describe = spark.createDataFrame(new_data)
# Row(**kwargs) may order its fields differently from describe(), and union()
# matches columns by position, so force the same column order first.
new_describe = new_describe.select(df_described.columns)
# Append the new statistics below the original describe() rows
# (union() is the non-deprecated spelling of unionAll()).
expanded_describe = df_described.union(new_describe)
expanded_describe.show()
# +--------+-------------------+-------------------+--------------------+-------------------+--------------------+
# | summary| sepal length (cm)| sepal width (cm)| petal length (cm)| petal width (cm)| target|
# +--------+-------------------+-------------------+--------------------+-------------------+--------------------+
# | count| 150| 150| 150| 150| 150|
# | mean| 5.843333333333335| 3.0540000000000007| 3.7586666666666693| 1.1986666666666672| 1.0|
# | stddev| 0.8280661279778637|0.43359431136217375| 1.764420419952262| 0.7631607417008414| 0.8192319205190406|
# | min| 4.3| 2.0| 1.0| 0.1| 0.0|
# | max| 7.9| 4.4| 6.9| 2.5| 2.0|
# | skew|0.31175305850229657| 0.3307028127733166|-0.27171195017163935|-0.1039436662675169|-4.07921986653155...|
# |kurtosis|-0.5735679489249783|0.24144329938317943| -1.3953593021397108|-1.3352456441311868| -1.5000000000000004|
# +--------+-------------------+-------------------+--------------------+-------------------+--------------------+
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment