Skip to content

Instantly share code, notes, and snippets.

@jlln
Last active April 20, 2016 08:13
Show Gist options
  • Save jlln/5dd295a8754e6c4d700f700fe6800c85 to your computer and use it in GitHub Desktop.
Save jlln/5dd295a8754e6c4d700f700fe6800c85 to your computer and use it in GitHub Desktop.
Spark/Scala function for determining the fractions of examples falling into different groups, taking into account other grouping criteria.
/**
 * Computes, for each combination of the outer grouping columns, the fraction of rows
 * that fall into each value of the `outcome` column.
 *
 * Two grouped counts are taken over `df`: one grouped by `outer_group_criteria`
 * (`outer_count`) and one grouped by `outer_group_criteria` plus `outcome`
 * (`inner_count`). They are joined on the outer grouping columns and
 * `Fraction = inner_count / outer_count` is appended.
 *
 * Side effect: registers `df` as the temporary table "df" on its own SQLContext,
 * replacing any temporary table already registered under that name.
 *
 * @param df                   the data to summarise
 * @param outcome              name of the column whose distribution is measured within each outer group
 * @param outer_group_criteria names of the columns that define the outer groups; must be non-empty
 * @return a DataFrame containing the outer grouping columns, `outcome`, `inner_count`,
 *         `outer_count` and `Fraction`
 * @throws IllegalArgumentException if `outer_group_criteria` is empty
 */
def groupOutcomeFractions(df:DataFrame,outcome:String,outer_group_criteria:Seq[String]):DataFrame = {
  // Fail early with a clear message instead of an opaque NoSuchElementException / SQL parse error.
  require(outer_group_criteria.nonEmpty, "outer_group_criteria must contain at least one column name")

  df.registerTempTable("df")

  // Runs a grouped row count over the registered table. The query is issued on the
  // DataFrame's own context, which is the one the temp table was registered with.
  def countsBy(columns: Seq[String], countAlias: String): DataFrame = {
    val columnList = columns.mkString(" , ")
    // COUNT(*) counts rows per group. It only differs from counting the first grouping
    // column for groups whose key is NULL, and those never match in the equi-join below.
    df.sqlContext.sql(s"SELECT $columnList, COUNT(*) AS $countAlias FROM df GROUP BY $columnList")
  }

  val outerCounts = countsBy(outer_group_criteria, "outer_count")
  val innerCounts = countsBy(outer_group_criteria :+ outcome, "inner_count")

  innerCounts
    .join(outerCounts, outer_group_criteria)
    .withColumn("Fraction", col("inner_count") / col("outer_count"))
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment