Zeppelin Notebook
// Cell 1:
import sys.process._
import org.json4s.jackson.JsonMethods._
import org.json4s.native.Serialization._
import org.json4s.{DefaultFormats, _}
import org.joda.time.format.DateTimeFormat
import org.joda.time.DateTime
import sqlContext.implicits._ // needed for .toDF(); Zeppelin's Spark interpreter may already import this
val jobQuery = "*"
val path = "/Users/marcromeyn/Projects/HackerNewsTrends/analysis/articles/"
val query = path + "HN-stories-" + jobQuery
// Input models
case class WebPage(title: String, metaDescription: String, metaKeywords: String, cleanedText: String, finalUrl: String, topImage: String)
case class HNItem(title: String, created_at: String, url: String, author: String, points: Int, story_text: String,
                  num_comments: Int, created_at_i: Int, objectID: String)
case class Item(webpage: WebPage, HNItem: HNItem)
case class Table(title: String, year: String, month: String, day: String, metaDescription: String, metaKeywords: String, cleanedText: String,
                 finalUrl: String, topImage: String, url: String, author: String, points: Int,
                 story_text: String, num_comments: Int, created_at_i: Int, objectID: String)
// Load all the files generated by the crawler and split each line (each line contains one JSON-encoded article)
// Filter out the empty articles (the ones that the crawler couldn't fetch)
val corpus = sc.wholeTextFiles(query).flatMap { case (_, file) =>
  implicit val formats = DefaultFormats // Brings in default date formats etc.
  file.split("\n")
    .map(line => parse(line).extract[Item])
    .filter(_.webpage.cleanedText != "")
    .map { x =>
      val web = x.webpage
      val item = x.HNItem
      // created_at is an ISO timestamp, so split off the date part and derive
      // year, year-month and year-month-day buckets for grouping later on
      val str = item.created_at.split("T")(0).split("-")
      val year = str(0)
      val month = str(0) + "-" + str(1)
      val day = str(0) + "-" + str(1) + "-" + str(2)
      Table(item.title, year, month, day, web.metaDescription, web.metaKeywords, web.cleanedText,
        web.finalUrl, web.topImage, item.url, item.author, item.points,
        item.story_text, item.num_comments, item.created_at_i, item.objectID)
    }
}
val table = corpus.toDF()
table.registerTempTable("table")
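// Optional sanity check (a sketch, not part of the original notebook):
// inspect the inferred schema and a few parsed rows before moving on to Cell 2.
table.printSchema()
table.select("title", "month", "points").show(5)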
// END CELL 1
// CELL 2
// Output models
case class wtv_Clustering(objectID: String, topicId: Int)
case class TopicStrings(id: Int, words: String)
case class Topic(id: Int, word: String, probability: Double)
val jobId = 17
val wtv_clustering = sc.textFile(s"/Users/marcromeyn/Projects/HackerNewsTrends/analysis/output/WordToVec/2015_$jobId/clustering")
val wtvTopics = sc.textFile(s"/Users/marcromeyn/Projects/HackerNewsTrends/analysis/output/WordToVec/2015_$jobId/topics")
// Turn each topics line into a (topicId, word) pair, then concatenate all words per topic
// into one space-separated string so every topic can be shown as a single row
val wtvTopicStrings = wtvTopics.map { row =>
  val items = row.replace("(", "").replace(")", "").split(",")
  (items(0).toInt, items(1))
}.reduceByKey(_ + " " + _).map(x => TopicStrings(x._1, x._2)).toDF()
wtvTopicStrings.registerTempTable("wtvTopicStrings")
// One row per (topicId, word, probability) triple
val wtvTopicsTable = wtvTopics.map { row =>
  val items = row.replace("(", "").replace(")", "").split(",")
  Topic(items(0).toInt, items(1), items(2).toDouble)
}.toDF()
wtvTopicsTable.registerTempTable("wtvTopicsTable")
// One row per (objectID, topicId) assignment from the clustering output
val wtvClusteringTable = wtv_clustering.map { row =>
  val items = row.replace("(", "").replace(")", "").split(",")
  wtv_Clustering(items(0), items(1).toInt)
}.toDF()
wtvClusteringTable.registerTempTable("wtvClusteringTable")
// Attach each crawled article to its topic id, then pull in that topic's word string
val wtv_joined_1 = wtvClusteringTable.join(table, table("objectID") === wtvClusteringTable("objectID"))
val wtvJoined = wtv_joined_1.join(wtvTopicStrings, wtvTopicStrings("id") === wtv_joined_1("topicId"))
wtvJoined.registerTempTable("wtvJoined")
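// The parsers above assume the raw WordToVec output lines look roughly like
// "(topicId,word,probability)" for topics and "(objectID,topicId)" for clustering
// (shapes inferred from the splitting code, not confirmed against the files).
// A quick peek at a few raw lines verifies the format:
wtvTopics.take(3).foreach(println)
wtv_clustering.take(3).foreach(println)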
// END CELL 2
// SQL EXAMPLES
%sql
select title, url, num_comments, points
from wtvJoined
where topicId = 79
order by month desc
limit 500
%sql
select *
from wtvTopicStrings
order by id
%sql
select topicId, count(1) Count
from wtvJoined
group by topicId
%sql
select month, sum(num_comments) Comments, count(1) Articles, sum(points) Points
from wtvJoined
where topicId = 79
group by month
order by month
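// The last %sql query can also be written with the DataFrame API in a Scala paragraph
// (a sketch; column names follow the Table case class above):
import org.apache.spark.sql.functions._

wtvJoined
  .filter(col("topicId") === 79)
  .groupBy("month")
  .agg(sum("num_comments").as("Comments"),
       count(lit(1)).as("Articles"),
       sum("points").as("Points"))
  .orderBy("month")
  .show()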