Last active
April 20, 2016 08:13
-
-
Save jlln/5dd295a8754e6c4d700f700fe6800c85 to your computer and use it in GitHub Desktop.
Spark/Scala function for determining the fractions of examples falling into different groups, taking into account other grouping criteria.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/**
  * For every combination of the outer grouping columns, computes the fraction of rows
  * that fall into each distinct value of `outcome`.
  *
  * The result has one row per (outer group, outcome value) pair and carries the outer
  * grouping columns, the `outcome` column, `inner_count` (count for that pair),
  * `outer_count` (count for the whole outer group) and `Fraction`
  * (`inner_count / outer_count`).
  *
  * Both counts are taken over the first outer grouping column, so rows where that
  * column is null do not contribute to either count.
  *
  * The aggregation is expressed with the DataFrame API instead of SQL text, so no temp
  * table is registered under a fixed name and column names are never spliced into a
  * query string.
  *
  * @param df                   input data
  * @param outcome              name of the column whose value distribution is measured
  * @param outer_group_criteria names of the columns defining the outer groups; must be non-empty
  * @return the joined inner/outer counts with the added `Fraction` column
  * @throws IllegalArgumentException if `outer_group_criteria` is empty
  */
def groupOutcomeFractions(df:DataFrame,outcome:String,outer_group_criteria:Seq[String]):DataFrame = {
  // Local import keeps this block self-contained; `col` is assumed to be in scope already.
  import org.apache.spark.sql.functions.count

  require(outer_group_criteria.nonEmpty, "outer_group_criteria must contain at least one column name")

  // Same counting target for both aggregations: the first outer grouping column.
  val count_variable: String = outer_group_criteria.head
  val inner_group_criteria = outer_group_criteria :+ outcome

  // Size of each outer group.
  val outer_count = df
    .groupBy(outer_group_criteria.map(col): _*)
    .agg(count(col(count_variable)).as("outer_count"))

  // Size of each (outer group, outcome value) cell.
  val inner_count = df
    .groupBy(inner_group_criteria.map(col): _*)
    .agg(count(col(count_variable)).as("inner_count"))

  // Attach the outer group size to every cell, then derive the fraction.
  inner_count
    .join(outer_count, outer_group_criteria)
    .withColumn("Fraction", col("inner_count") / col("outer_count"))
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment