Useful options:
Useful options:
/** A person record carrying a `name` and an `age`. */
case class Person(name: String, age: Int)

/**
 * A pair of two string-valued members, `p1` and `p2`.
 *
 * NOTE(review): presumably these hold the names of the two people in the
 * couple -- confirm against the query that builds them.
 */
case class Couple(p1: String, p2: String)
def sameAgeCouples( | |
people: Dataset[Person], | |
couples: Dataset[Couple] | |
): Dataset[Couple] = | |
run { | |
for { | |
p1 <- liftQuery(people) |
// flatMap variant whose function argument maps each T to another RDD[U].
def flatMap[U](f: T => RDD[U]): RDD[U] |
// flatMap variant whose function argument maps each T to a TraversableOnce[U]
// (a plain collection/iterator of U rather than an RDD).
def flatMap[U](f: T => TraversableOnce[U]): RDD[U] |
-- Counts the lower-cased words of x1.text that start with '#', ordered by
-- count from highest to lowest (ascending order of the negated count).
-- The (?) placeholder stands for the input relation; presumably it is bound
-- to the tweets dataset by the caller -- confirm.
SELECT x10.*
FROM
  (SELECT LOWER (x2._1) _1,
          COUNT(*) _2
   FROM
     -- one row per space-separated word of the text column
     (SELECT explode(SPLIT(x1.text, ' ')) _1
      FROM (?) x1) x2
   WHERE x2._1 LIKE (concat('#', '%'))
   GROUP BY LOWER (x2._1)) x10
-- the derived table is aliased x10, so outer references must use x10
ORDER BY - (x10._2) ASC NULLS FIRST
/**
 * Computes the `n` most frequent hashtags found in the text of the tweets.
 *
 * Words are obtained by splitting each tweet's text on whitespace; a word is
 * a hashtag when it starts with `#`. Hashtags are lower-cased before counting.
 *
 * @param tweets the tweets whose `text` is scanned
 * @param n      how many of the highest-count hashtags to return
 * @return       up to `n` `(hashtag, count)` pairs selected by highest count
 */
def topHashtags(tweets: RDD[Tweet], n: Int): Array[(String, BigInt)] = {
  // split every tweet into words and keep only the normalized hashtags
  val hashtags = tweets
    .flatMap(tweet => tweet.text.split("\\s+"))
    .filter(word => word.startsWith("#"))
    .map(word => word.toLowerCase)

  // pair each occurrence with 1 and sum the occurrences per hashtag
  val counts = hashtags
    .map(tag => (tag, BigInt(1)))
    .reduceByKey(_ + _)

  // take the n entries with the largest counters
  counts.top(n)(Ordering.by(_._2))
}
/**
 * Computes the `n` most frequent hashtags using the untyped DataFrame API.
 *
 * The `text` column is split on whitespace and exploded into one row per
 * word; words are lower-cased, restricted to those starting with `#`,
 * counted per word, and the `n` rows with the highest count are kept.
 *
 * @param tweets input rows; must expose a `text` column
 * @param n      maximum number of rows to return
 * @return       a DataFrame with the columns `word` and `count`
 */
def topHashtags(tweets: DataFrame, n: Int): DataFrame =
  tweets
    .select(explode(split($"text", "\\s+"))) // split it into words
    .select(lower($"col") as "word")         // normalize hashtags
    .filter("word like '#%'")                // filter hashtag words
    .groupBy($"word")                        // group by each hashtag
    .agg(count("*") as "count")              // aggregate the count
    .orderBy($"count".desc)                  // order, highest count first (method call, not a postfix operator)
    .limit(n)                                // limit to top results
/**
 * Computes the `n` most frequent hashtags, mixing the typed Dataset API with
 * the untyped DataFrame API.
 *
 * The text of each tweet is split on whitespace, words starting with `#` are
 * kept and lower-cased, then counted per hashtag; the `n` rows with the
 * highest count are returned.
 *
 * @param tweets the tweets whose `text` column is scanned
 * @param n      maximum number of results
 * @return       up to `n` `(hashtag, count)` pairs, highest count first
 */
def topHashtags(tweets: Dataset[Tweet], n: Int): Dataset[(String, BigInt)] =
  tweets
    .select($"text".as[String])  // select the text column (Dataframe)
    .flatMap(_.split("\\s+"))    // split it into words (Dataset)
    .filter(_.startsWith("#"))   // filter hashtag words (Dataset)
    .map(_.toLowerCase)          // normalize hashtags (Dataset)
    .groupBy($"value")           // group by each hashtag (Dataframe)
    .agg(count("*") as "count")  // aggregate the count (Dataframe)
    .orderBy($"count".desc)      // order, highest count first (Dataframe)
    .limit(n)                    // limit to top results (Dataframe)
    .as[(String, BigInt)]        // back to a typed Dataset; the untyped rows do not match the declared return type
def topHashtags(tweets: Dataset[Tweet], n: Int): Dataset[(String, Long)] = | |
run { // produce a dataset from the Quill query | |
liftQuery(tweets) // transform the dataset into a Quill query | |
.concatMap(_.text.split(" ")) // split into words and unnest results | |
.filter(_.startsWith("#")) // filter hashtag words | |
.map(_.toLowerCase) // normalize hashtags | |
.groupBy(word => word) // group by each hashtag | |
.map { // map word list to its count | |
case (word, list) => | |
(word, list.size) |
Please help us validate Quill 2.0.0-SNAPSHOT
so we can make the final release. Steps:
resolvers += Resolver.sonatypeRepo("snapshots")
2.0.0-SNAPSHOT
This version has 0️⃣ (yes, ZERO!) known bugs