Flavio Brasil fwbrasil

Benchmarking

Useful options:

Please help us validate Quill 2.0.0-SNAPSHOT so we can make the final release. Steps:

Read the migration notes
Add the snapshots repo to your sbt build: resolvers += Resolver.sonatypeRepo("snapshots")
Update the Quill version to 2.0.0-SNAPSHOT
Fix the compilation errors; feel free to ask questions on the Gitter channel
Let us know the results

This version has 0️⃣ (yes, ZERO!) known bugs

	/**
	 * A person record.
	 *
	 * @param name the person's name
	 * @param age  the person's age
	 */
	case class Person(
		name: String,
		age: Int
	)
	/**
	 * A pair of two string-valued members.
	 *
	 * NOTE(review): `p1` and `p2` are presumably the names of the two people
	 * forming the couple -- confirm against the queries that build it.
	 *
	 * @param p1 first member of the pair
	 * @param p2 second member of the pair
	 */
	case class Couple(
		p1: String,
		p2: String
	)

	def sameAgeCouples(
	people: Dataset[Person],
	couples: Dataset[Couple]
	): Dataset[Couple] =
	run {
	for {
	p1 <- liftQuery(people)

	/**
	 * Counts hashtags across the text of all tweets and returns the `n` most frequent.
	 *
	 * @param tweets the tweets whose `text` is scanned
	 * @param n      how many of the highest-count hashtags to return
	 * @return `(hashtag, count)` pairs selected by `top` using the count as ordering
	 */
	def topHashtags(tweets: RDD[Tweet], n: Int): Array[(String, BigInt)] = {
		// Rank pairs by their counter (second tuple element).
		val byCount: Ordering[(String, BigInt)] = Ordering.by(_._2)

		// Break every tweet text into whitespace-separated words.
		val words = tweets.flatMap(tweet => tweet.text.split("\\s+"))

		// Keep only words starting with '#', then lower-case them.
		val hashtags = words
			.filter(word => word.startsWith("#"))
			.map(tag => tag.toLowerCase)

		// Pair each hashtag with 1 and sum the ones per hashtag.
		val counts = hashtags
			.map(tag => (tag, BigInt(1)))
			.reduceByKey(_ + _)

		counts.top(n)(byCount)
	}

	/**
	 * Counts hashtags in the `text` column using the untyped DataFrame API and
	 * returns the `n` rows with the highest counts.
	 *
	 * @param tweets frame with a `text` column
	 * @param n      maximum number of rows to return
	 * @return a frame with columns `word` and `count`, ordered by `count` descending
	 */
	def topHashtags(tweets: DataFrame, n: Int): DataFrame =
		tweets
			.select(explode(split($"text", "\\s+"))) // split into words, one row each (referenced as "col" below)
			.select(lower($"col") as "word") // lower-case each word so hashtags are normalized
			.filter("word like '#%'") // keep only words starting with '#'
			.groupBy($"word") // group by each hashtag
			.agg(count("*") as "count") // count the rows in each group
			.orderBy($"count".desc) // method call instead of postfix `desc` (no postfixOps needed)
			.limit(n) // limit to top results

	/**
	 * Counts hashtags in the tweets' text, mixing typed Dataset operations with
	 * untyped DataFrame operations, and returns the `n` most frequent.
	 *
	 * @param tweets the tweets whose `text` column is scanned
	 * @param n      maximum number of results
	 * @return `(hashtag, count)` pairs ordered by count descending
	 */
	def topHashtags(tweets: Dataset[Tweet], n: Int): Dataset[(String, BigInt)] =
		tweets
			.select($"text".as[String]) // select the text column (Dataframe)
			.flatMap(_.split("\\s+")) // split it into words (Dataset)
			.filter(_.startsWith("#")) // filter hashtag words (Dataset)
			.map(_.toLowerCase) // normalize hashtags (Dataset)
			.groupBy($"value") // group by each hashtag (Dataframe)
			.agg(count("*") as "count") // aggregate the count (Dataframe)
			.orderBy($"count".desc) // order (Dataframe); method call avoids postfix-operator syntax
			.limit(n) // limit to top results (Dataframe)
			.as[(String, BigInt)] // the aggregation is untyped; convert back to the declared tuple type

	def topHashtags(tweets: Dataset[Tweet], n: Int): Dataset[(String, Long)] =
	run { // produce a dataset from the Quill query
	liftQuery(tweets) // trasform the dataset into a Quill query
	.concatMap(_.text.split(" ")) // split into words and unnest results
	.filter(_.startsWith("#")) // filter hashtag words
	.map(_.toLowerCase) // normalize hashtags
	.groupBy(word => word) // group by each hashtag
	.map { // map word list to its count
	case (word, list) =>
	(word, list.size)