Created
November 23, 2018 07:36
-
-
Save korkridake/4adba6ccb7dc824712f9d38ad307e355 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Notebook-style cell: evaluating the bare name displays the session object.
# NOTE(review): `spark` is not created in this file; presumably it is provided
# by the notebook environment -- confirm before running as a plain script.
spark
# <pyspark.sql.session.SparkSession at 0x7f8df8673ba8>
# -------------------------------------------------------------------------------
# Import PySpark Libraries
# -------------------------------------------------------------------------------
from pyspark.sql import Row
from pyspark.sql.functions import skewness, kurtosis
from pyspark.sql.functions import var_pop, var_samp, stddev, stddev_pop, sumDistinct, ntile
from pyspark.sql.types import IntegerType
from pyspark.sql.types import StringType
# -------------------------------------------------------------------------------
# Load Data and Declare Variable Type
# Resources
# - https://stackoverflow.com/questions/46956026/how-to-convert-column-with-string-type-to-int-form-in-pyspark-data-frame
# -------------------------------------------------------------------------------
# Read the iris CSV with a header row, letting Spark infer the column types.
df = spark.read.csv(
    'dbfs:/mnt/devilsci_mount/temp/iris_data.csv',
    header=True,
    inferSchema=True,
    sep=',',
)
# Declare `target` as a string column (see the printSchema output below).
df = df.withColumn("target", df["target"].cast(StringType()))
df.show(10)
# +-----------------+----------------+-----------------+----------------+------+
# |sepal length (cm)|sepal width (cm)|petal length (cm)|petal width (cm)|target|
# +-----------------+----------------+-----------------+----------------+------+
# |              5.1|             3.5|              1.4|             0.2|   0.0|
# |              4.9|             3.0|              1.4|             0.2|   0.0|
# |              4.7|             3.2|              1.3|             0.2|   0.0|
# |              4.6|             3.1|              1.5|             0.2|   0.0|
# |              5.0|             3.6|              1.4|             0.2|   0.0|
# |              5.4|             3.9|              1.7|             0.4|   0.0|
# |              4.6|             3.4|              1.4|             0.3|   0.0|
# |              5.0|             3.4|              1.5|             0.2|   0.0|
# |              4.4|             2.9|              1.4|             0.2|   0.0|
# |              4.9|             3.1|              1.5|             0.1|   0.0|
# +-----------------+----------------+-----------------+----------------+------+
# Glimpse command in PySpark
df.printSchema()
# root
# |-- sepal length (cm): double (nullable = true)
# |-- sepal width (cm): double (nullable = true)
# |-- petal length (cm): double (nullable = true)
# |-- petal width (cm): double (nullable = true)
# |-- target: string (nullable = true)
# Baseline summary statistics; kept in a variable so extra rows can be appended later.
df_described = df.describe()
df_described.show()
# +-------+------------------+-------------------+------------------+------------------+------------------+
# |summary| sepal length (cm)|   sepal width (cm)| petal length (cm)|  petal width (cm)|            target|
# +-------+------------------+-------------------+------------------+------------------+------------------+
# |  count|               150|                150|               150|               150|               150|
# |   mean| 5.843333333333335| 3.0540000000000007|3.7586666666666693|1.1986666666666672|               1.0|
# | stddev|0.8280661279778637|0.43359431136217375| 1.764420419952262|0.7631607417008414|0.8192319205190406|
# |    min|               4.3|                2.0|               1.0|               0.1|               0.0|
# |    max|               7.9|                4.4|               6.9|               2.5|               2.0|
# +-------+------------------+-------------------+------------------+------------------+------------------+
# Example of a single extra statistic computed on one column.
df.select(skewness('sepal length (cm)')).show()
# +---------------------------+
# |skewness(sepal length (cm))|
# +---------------------------+
# |        0.31175305850229657|
# +---------------------------+
# Column names of the describe() output, i.e.
# ['summary', 'sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)', 'target']
columns = df_described.columns
# Aggregate functions to add to the summary (imported earlier) ...
funcs = [skewness, kurtosis]
# ... and the row label for each one, in the same order as `funcs`.
fnames = ['skew', 'kurtosis']
def new_item(func, column):
    """Apply one aggregate function to one column of ``df`` and return it as text.

    Args:
        func: Aggregate function that is called with the column name,
            e.g. ``skewness`` or ``kurtosis``.
        column: Name of the column of the module-level ``df`` to aggregate.

    Returns:
        The single aggregated value converted with ``str``. A string is
        returned even though the value is numeric, because that matches the
        output of ``describe()``.
    """
    # The selection yields one row with one field: take row 0, field 0.
    return str(df.select(func(column)).collect()[0][0])
# Build one Row per extra statistic, shaped like the rows describe() returns.
new_data = []
for func, fname in zip(funcs, fnames):
    # Each row object begins with an entry for "summary" (the statistic's label).
    row_dict = {'summary': fname}
    # columns[0] is "summary" itself, so only the remaining columns are aggregated.
    for column in columns[1:]:
        row_dict[column] = new_item(func, column)
    # ** unpacks the dictionary entries as the Row's named fields.
    new_data.append(Row(**row_dict))
print(new_data)
# [Row(petal length (cm)='-0.27171195017163935', petal width (cm)='-0.1039436662675169', sepal length (cm)='0.31175305850229657', sepal width (cm)='0.3307028127733166', summary='skew', target='-4.079219866531552e-17'), Row(petal length (cm)='-1.3953593021397108', petal width (cm)='-1.3352456441311868', sepal length (cm)='-0.5735679489249783', sepal width (cm)='0.24144329938317943', summary='kurtosis', target='-1.5000000000000004')]
# For comparison: the rows describe() produced (all values are strings).
df_described.collect()
# [Row(summary='count', sepal length (cm)='150', sepal width (cm)='150', petal length (cm)='150', petal width (cm)='150', target='150'),
# Row(summary='mean', sepal length (cm)='5.843333333333335', sepal width (cm)='3.0540000000000007', petal length (cm)='3.7586666666666693', petal width (cm)='1.1986666666666672', target='1.0'),
# Row(summary='stddev', sepal length (cm)='0.8280661279778637', sepal width (cm)='0.43359431136217375', petal length (cm)='1.764420419952262', petal width (cm)='0.7631607417008414', target='0.8192319205190406'),
# Row(summary='min', sepal length (cm)='4.3', sepal width (cm)='2.0', petal length (cm)='1.0', petal width (cm)='0.1', target='0.0'),
# Row(summary='max', sepal length (cm)='7.9', sepal width (cm)='4.4', petal length (cm)='6.9', petal width (cm)='2.5', target='2.0')]
# Turn the rows collected in the loop into a DataFrame, using the same
# `spark` session the rest of the script relies on.
new_describe = spark.createDataFrame(new_data)
# Force the columns into the same order as describe(): the union below
# matches columns by position, not by name.
new_describe = new_describe.select(df_described.columns)
# Append the new statistics to the original describe() output.
# union() is the non-deprecated spelling of unionAll() (deprecated since Spark 2.0).
expanded_describe = df_described.union(new_describe)
expanded_describe.show()
# +--------+-------------------+-------------------+--------------------+-------------------+--------------------+
# | summary|  sepal length (cm)|   sepal width (cm)|   petal length (cm)|   petal width (cm)|              target|
# +--------+-------------------+-------------------+--------------------+-------------------+--------------------+
# |   count|                150|                150|                 150|                150|                 150|
# |    mean|  5.843333333333335| 3.0540000000000007|  3.7586666666666693| 1.1986666666666672|                 1.0|
# |  stddev| 0.8280661279778637|0.43359431136217375|   1.764420419952262| 0.7631607417008414|  0.8192319205190406|
# |     min|                4.3|                2.0|                 1.0|                0.1|                 0.0|
# |     max|                7.9|                4.4|                 6.9|                2.5|                 2.0|
# |    skew|0.31175305850229657| 0.3307028127733166|-0.27171195017163935|-0.1039436662675169|-4.07921986653155...|
# |kurtosis|-0.5735679489249783|0.24144329938317943| -1.3953593021397108|-1.3352456441311868| -1.5000000000000004|
# +--------+-------------------+-------------------+--------------------+-------------------+--------------------+
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment