Useful options:
Useful options:
| case class Person(name: String, age: Int) | |
| case class Couple(p1: String, p2: String) | |
| def sameAgeCouples( | |
| people: Dataset[Person], | |
| couples: Dataset[Couple] | |
| ): Dataset[Couple] = | |
| run { | |
| for { | |
| p1 <- liftQuery(people) |
| def flatMap[U](f: T => RDD[U]): RDD[U] |
| def flatMap[U](f: T => TraversableOnce[U]): RDD[U] |
| SELECT x2.* | |
| FROM | |
| (SELECT LOWER (x2._1) _1, | |
| COUNT(*) _2 | |
| FROM | |
| (SELECT explode(SPLIT(x1.text, ' ')) _1 | |
| FROM (?) x1) x2 | |
| WHERE x2._1 LIKE (concat('#', '%')) | |
| GROUP BY LOWER (x2._1)) x10 | |
| ORDER BY - (x2._2) ASC NULLS FIRST |
| def topHashtags(tweets: RDD[Tweet], n: Int): Array[(String, BigInt)] = | |
| tweets | |
| .flatMap(_.text.split("\\s+")) // split it into words | |
| .filter(_.startsWith("#")) // filter hashtag words | |
| .map(_.toLowerCase) // normalize hashtags | |
| .map((_, BigInt(1))) // create tuples for counting | |
| .reduceByKey((a, b) => a + b) // accumulate counters | |
| .top(n)(Ordering.by(_._2)) // return ordered top hashtags |
| def topHashtags(tweets: DataFrame, n: Int): DataFrame = | |
| tweets | |
| .select(explode(split($"text", "\\s+"))) // split it into words | |
| .select(lower($"col") as "word") // normalize hashtags | |
| .filter("word like '#%'") // filter hashtag words | |
| .groupBy($"word") // group by each hashtag | |
| .agg(count("*") as "count") // aggregate the count | |
| .orderBy($"count" desc) // order | |
| .limit(n) // limit to top results |
| def topHashtags(tweets: Dataset[Tweet], n: Int): Dataset[(String, BigInt)] = | |
| tweets | |
| .select($"text".as[String]) // select the text column (Dataframe) | |
| .flatMap(_.split("\\s+")) // split it into words (Dataset) | |
| .filter(_.startsWith("#")) // filter hashtag words (Dataset) | |
| .map(_.toLowerCase) // normalize hashtags (Dataset) | |
| .groupBy($"value") // group by each hashtag (Dataframe) | |
| .agg(count("*") as "count") // aggregate the count (Dataframe) | |
| .orderBy($"count" desc) // order (Dataframe) | |
| .limit(n) // limit to top results (Dataframe) |
| def topHashtags(tweets: Dataset[Tweet], n: Int): Dataset[(String, Long)] = | |
| run { // produce a dataset from the Quill query | |
| liftQuery(tweets) // transform the dataset into a Quill query | |
| .concatMap(_.text.split(" ")) // split into words and unnest results | |
| .filter(_.startsWith("#")) // filter hashtag words | |
| .map(_.toLowerCase) // normalize hashtags | |
| .groupBy(word => word) // group by each hashtag | |
| .map { // map word list to its count | |
| case (word, list) => | |
| (word, list.size) |
Please help us validate Quill 2.0.0-SNAPSHOT so we can make the final release. Steps:
1. Add the snapshots resolver: `resolvers += Resolver.sonatypeRepo("snapshots")`
2. Set the Quill version to `2.0.0-SNAPSHOT`

This version has 0️⃣ (yes, ZERO!) known bugs.