Zeppelin Notebook
// Cell 1:
import sys.process._
import org.json4s.jackson.JsonMethods._
import org.json4s.native.Serialization._
import org.json4s.{DefaultFormats, _}
import org.joda.time.format.DateTimeFormat
import org.joda.time.DateTime
import sqlContext.implicits._ // needed for .toDF(); Zeppelin's Spark interpreter may already import this
val jobQuery = "*"
val path = "/Users/marcromeyn/Projects/HackerNewsTrends/analysis/articles/"
val query = path + "HN-stories-" + jobQuery
// Input models
case class WebPage(title: String, metaDescription: String, metaKeywords: String, cleanedText: String, finalUrl: String, topImage: String)
case class HNItem(title: String, created_at: String, url: String, author: String, points: Int, story_text: String,
                  num_comments: Int, created_at_i: Int, objectID: String)
case class Item(webpage: WebPage, HNItem: HNItem)
case class Table(title: String, year: String, month: String, day: String, metaDescription: String, metaKeywords: String, cleanedText: String,
                 finalUrl: String, topImage: String, url: String, author: String, points: Int,
                 story_text: String, num_comments: Int, created_at_i: Int, objectID: String)
// Load all the files generated by the crawler and split each line (each line contains one JSON-encoded article)
// Filter out the empty articles (the ones that the crawler couldn't fetch)
val corpus = sc.wholeTextFiles(query).flatMap { case (_, file) =>
  implicit val formats = DefaultFormats // Brings in default date formats etc.
  file.split("\n")
    .map(line => parse(line).extract[Item])
    .filter(_.webpage.cleanedText != "")
    .map { x =>
      val web = x.webpage
      val item = x.HNItem
      // created_at is an ISO timestamp, so split off the date part and derive
      // year, year-month and year-month-day buckets for grouping later on
      val str = item.created_at.split("T")(0).split("-")
      val year = str(0)
      val month = str(0) + "-" + str(1)
      val day = str(0) + "-" + str(1) + "-" + str(2)
      Table(item.title, year, month, day, web.metaDescription, web.metaKeywords, web.cleanedText,
        web.finalUrl, web.topImage, item.url, item.author, item.points,
        item.story_text, item.num_comments, item.created_at_i, item.objectID)
    }
}
val table = corpus.toDF()
table.registerTempTable("table")
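// Optional sanity check (a sketch, not part of the original notebook):
// inspect the inferred schema and a few parsed rows before moving on to Cell 2.
table.printSchema()
table.select("title", "month", "points").show(5)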
// END CELL 1
// CELL 2
// Output models
case class wtv_Clustering(objectID: String, topicId: Int)
case class TopicStrings(id: Int, words: String)
case class Topic(id: Int, word: String, probability: Double)
val jobId = 17
val wtv_clustering = sc.textFile(s"/Users/marcromeyn/Projects/HackerNewsTrends/analysis/output/WordToVec/2015_$jobId/clustering")
val wtvTopics = sc.textFile(s"/Users/marcromeyn/Projects/HackerNewsTrends/analysis/output/WordToVec/2015_$jobId/topics")
// Turn each topics line into a (topicId, word) pair, then concatenate all words per topic
// into one space-separated string so every topic can be shown as a single row
val wtvTopicStrings = wtvTopics.map { row =>
  val items = row.replace("(", "").replace(")", "").split(",")
  (items(0).toInt, items(1))
}.reduceByKey(_ + " " + _).map(x => TopicStrings(x._1, x._2)).toDF()
wtvTopicStrings.registerTempTable("wtvTopicStrings")
// One row per (topicId, word, probability) triple
val wtvTopicsTable = wtvTopics.map { row =>
  val items = row.replace("(", "").replace(")", "").split(",")
  Topic(items(0).toInt, items(1), items(2).toDouble)
}.toDF()
wtvTopicsTable.registerTempTable("wtvTopicsTable")
// One row per (objectID, topicId) assignment from the clustering output
val wtvClusteringTable = wtv_clustering.map { row =>
  val items = row.replace("(", "").replace(")", "").split(",")
  wtv_Clustering(items(0), items(1).toInt)
}.toDF()
wtvClusteringTable.registerTempTable("wtvClusteringTable")
// Attach each crawled article to its topic id, then pull in that topic's word string
val wtv_joined_1 = wtvClusteringTable.join(table, table("objectID") === wtvClusteringTable("objectID"))
val wtvJoined = wtv_joined_1.join(wtvTopicStrings, wtvTopicStrings("id") === wtv_joined_1("topicId"))
wtvJoined.registerTempTable("wtvJoined")
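// The parsers above assume the raw WordToVec output lines look roughly like
// "(topicId,word,probability)" for topics and "(objectID,topicId)" for clustering
// (shapes inferred from the splitting code, not confirmed against the files).
// A quick peek at a few raw lines verifies the format:
wtvTopics.take(3).foreach(println)
wtv_clustering.take(3).foreach(println)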
// END CELL 2
// SQL EXAMPLES
%sql
select title, url, num_comments, points
from wtvJoined
where topicId = 79
order by month desc
limit 500
%sql
select *
from wtvTopicStrings
order by id
%sql
select topicId, count(1) Count
from wtvJoined
group by topicId
%sql
select month, sum(num_comments) Comments, count(1) Articles, sum(points) Points
from wtvJoined
where topicId = 79
group by month
order by month
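// The last %sql query can also be written with the DataFrame API in a Scala paragraph
// (a sketch; column names follow the Table case class above):
import org.apache.spark.sql.functions._

wtvJoined
  .filter(col("topicId") === 79)
  .groupBy("month")
  .agg(sum("num_comments").as("Comments"),
       count(lit(1)).as("Articles"),
       sum("points").as("Points"))
  .orderBy("month")
  .show()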