Last active
August 29, 2015 14:23
-
-
Save marcromeyn/083187aec94a035868e7 to your computer and use it in GitHub Desktop.
Zeppelin Notebook
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| // Cell 1: | |
| import sys.process._ | |
| import org.json4s.jackson.JsonMethods._ | |
| import org.json4s.native.Serialization._ | |
| import org.json4s.{DefaultFormats, _} | |
| import org.joda.time.format.DateTimeFormat | |
| import org.joda.time.DateTime | |
| val jobQuery = "*" | |
| val path = "/Users/marcromeyn/Projects/HackerNewsTrends/analysis/articles/" | |
| val query = path + "HN-stories-" + jobQuery | |
| // Input models | |
| case class WebPage(title: String, metaDescription: String, metaKeywords: String, cleanedText: String, finalUrl: String, topImage: String) | |
| case class HNItem(title: String, created_at: String, url: String, author: String, points: Int, story_text: String, | |
| num_comments: Int, created_at_i: Int, objectID: String) | |
| case class Item(webpage: WebPage, HNItem: HNItem) | |
| case class Table(title: String, year: String, month: String, day: String, metaDescription: String, metaKeywords: String, cleanedText: String, | |
| finalUrl: String, topImage: String, url: String, author: String, points: Int, | |
| story_text: String, num_comments: Int, created_at_i: Int, objectID: String) | |
| // Load all the files generated by the crawler and split each line (each line contains one article) | |
| // Get rid of weird characters and filter all the empty artciles (the ones that the crawler couldn't fetch) | |
| val corpus = sc.wholeTextFiles(query).flatMap { case (_, file) => | |
| file.split("\n").map { | |
| implicit val formats = DefaultFormats // Brings in default date formats etc. | |
| parse(_).extract[Item] | |
| }.filter(_.webpage.cleanedText != "").map { x => | |
| val web = x.webpage | |
| val item = x.HNItem | |
| val str = item.created_at.split("T")(0).split("-") | |
| val year = str(0) | |
| val month = str(0) + "-" + str(1) | |
| val day = str(0) + "-" + str(1) + "-" + str(2) | |
| Table(item.title, year, month, day, web.metaDescription, web.metaKeywords, web.cleanedText, | |
| web.finalUrl, web.topImage, item.url, item.author, item.points, | |
| item.story_text, item.num_comments, item.created_at_i, item.objectID) | |
| } | |
| } | |
| val table = corpus.toDF() | |
| table.registerTempTable("table") | |
| // END CELL 1 | |
| // CELL 2 | |
| // Output models | |
| case class wtv_Clustering(objectID: String, topicId: Int) | |
| case class TopicStrings(id: Int, words: String) | |
| case class Topic(id: Int, word: String, probability: Double) | |
| val jobId = 17 | |
| val wtv_clustering = sc.textFile(s"/Users/marcromeyn/Projects/HackerNewsTrends/analysis/output/WordToVec/2015_$jobId/clustering") | |
| val wtvTopics = sc.textFile(s"/Users/marcromeyn/Projects/HackerNewsTrends/analysis/output/WordToVec/2015_$jobId/topics") | |
| val wtvTopicStrings = wtvTopics.map { row => | |
| val items = row.replace("(", "").replace(")", "").split(",") | |
| (items(0).toInt, items(1)) | |
| }.reduceByKey(_ + " " + _).map(x => TopicStrings(x._1, x._2)).toDF() | |
| wtvTopicStrings.registerTempTable("wtvTopicStrings") | |
| val wtvTopicsTable = wtv_topics.map { row => | |
| val items = row.replace("(", "").replace(")", "").split(",") | |
| Topic(items(0).toInt, items(1), items(2).toDouble) | |
| }.toDF() | |
| wtvTopicsTable.registerTempTable("wtvTopicsTable") | |
| val wtvClusteringTable = wtv_clustering.map { row => | |
| val items = row.replace("(", "").replace(")", "").split(",") | |
| wtv_Clustering(items(0), items(1).toInt) | |
| }.toDF() | |
| wtvClusteringTable.registerTempTable("wtvClusteringTable") | |
| val wtv_joined_1 = wtvClusteringTable.join(table, table("objectID") === wtvClusteringTable("objectID")) | |
| val wtvJoined = wtv_joined_1.join(wtvTopicStrings, wtvTopicStrings("id") === wtv_joined_1("topicId") ) | |
| wtvJoined.registerTempTable("wtvJoined") | |
| // END CELL 2 | |
| // SQL EXAMPLES | |
| %sql | |
| select title, url, num_comments, points | |
| from wtvJoined | |
| where topicId = 79 | |
| order by month desc | |
| limit 500 | |
| %sql | |
| select * | |
| from wtvTopicStrings | |
| order by id | |
| %sql | |
| select topicId, count(1) Count | |
| from wtvJoined | |
| group by topicId | |
| %sql | |
| select month, sum(num_comments) Comments, count(1) Articles, sum(points) Points | |
| from wtvJoined | |
| where topicId = 79 | |
| group by month | |
| order by month |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment