This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/**
 * Typed row shapes used by the ANOVA function.
 *
 * `CatTuple` is the row type passed to the ANOVA function so that columns can be
 * referred to by specific names: `cat` holds the category label and `value` holds
 * the numeric observation.
 *
 * `ANOVAStats` is the result type returned from the ANOVA function so that its
 * outputs can be selected and referred to by name.
 * NOTE(review): `dfb` / `dfw` are presumably the between-group and within-group
 * degrees of freedom; confirm against the ANOVA implementation, which is not
 * visible here.
 */
final case class CatTuple(cat: String, value: Double)
final case class ANOVAStats(dfb: Long, dfw: Double, F_value: Double, etaSq: Double, omegaSq: Double)

// Column names to use when converting to CatTuple; order matches the CatTuple fields.
val colnames: Seq[String] = Seq("cat", "value")
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/**
 * Row type passed to the Pearson's R function so that columns can be referred to
 * by specific names: `var1` and `var2` are the two numeric variables being compared.
 */
final case class ScaleTuple(var1: Double, var2: Double)

// Column names to use when converting to ScaleTuple; order matches the ScaleTuple fields.
val colnames: Seq[String] = Seq("var1", "var2")
/** | |
* Implementation of Pearson's R function: calculates r, the measurement of linear dependence between two variables | |
* Utilizes DataSet's 'agg' function | |
**/ |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from pyspark.sql.functions import * | |
# Implementation of ANOVA function: calculates the degrees of freedom, F-value, eta squared and omega squared values. | |
# Expects 'categoryData' to have two columns: the first is the categorical independent variable and the second is the scale dependent variable.
def getAnovaStats(categoryData) : | |
cat_val = categoryData.toDF("cat","value") | |
cat_val.createOrReplaceTempView("df") | |
newdf = spark.sql("select A.cat, A.value, cast((A.value * A.value) as double) as valueSq, ((A.value - B.avg) * (A.value - B.avg)) as diffSq from df A join (select cat, avg(value) as avg from df group by cat) B where A.cat = B.cat") | |
grouped = newdf.groupBy("cat") |