Last active
October 17, 2016 21:06
-
-
Save zouzias/ae4fb6a6d349f3aa30021e07c6466cd8 to your computer and use it in GitHub Desktop.
MaxMind cities example
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{"paragraphs":[{"text":"// Add this in the interpreter\n// %dep\n// z.addRepo(\"Spark Packages Repo\").url(\"http://dl.bintray.com/spark-packages/maven\")\n// z.load(\"org.zouzias:spark-lucenerdd_2.11:0.2.1\")\n\nval citiesDF = sqlContext.read.parquet(\"s3://recordlinkage/world-cities-maxmind.parquet\")\ncitiesDF.cache\nval total = citiesDF.count\n\nprintln(s\"Cities: ${total}\")","dateUpdated":"2016-10-17T21:01:57+0000","config":{"colWidth":12,"editorMode":"ace/mode/scala","graph":{"mode":"table","height":300,"optionOpen":false,"keys":[],"values":[],"groups":[],"scatter":{}},"enabled":true},"settings":{"params":{},"forms":{}},"jobName":"paragraph_1476736326171_94517929","id":"20161017-201243_2050124200","result":{"code":"SUCCESS","type":"TEXT","msg":"citiesDF: org.apache.spark.sql.DataFrame = [Country: string, City: string ... 5 more fields]\nres1: citiesDF.type = [Country: string, City: string ... 5 more fields]\ntotal: Long = 3173958\nCities: 3173958\n"},"dateCreated":"2016-10-17T20:32:06+0000","dateStarted":"2016-10-17T21:01:57+0000","dateFinished":"2016-10-17T21:02:53+0000","status":"FINISHED","progressUpdateIntervalMs":500,"$$hashKey":"object:1165"},{"text":"import org.zouzias.spark.lucenerdd.facets.FacetedLuceneRDD\nimport org.apache.spark.sql.DataFrame\n\nval fieldName = \"Country\"\nval k: Int = 20\n\ncitiesDF.cache()\ncitiesDF.count()\nval luceneRDD = FacetedLuceneRDD(citiesDF)\nluceneRDD.cache()\nluceneRDD.count()\n \ndef benchmark(citiesDF: DataFrame, luceneRDD: FacetedLuceneRDD, k: Int): (Long, Long) = {\n // Run DataFrame groupBy count and sort\n \n val dfStart = System.currentTimeMillis()\n val dfResults = citiesDF.groupBy(fieldName).count().sort(desc(\"count\")).take(k)\n val dfEnd = System.currentTimeMillis()\n \n // Run Faceted search\n val lucStart =System.currentTimeMillis()\n luceneRDD.facetQuery(\"*:*\", fieldName, facetNum = k)\n val lucEnd =System.currentTimeMillis()\n \n (dfEnd - dfStart, lucEnd - 
lucStart)\n}","dateUpdated":"2016-10-17T21:01:57+0000","config":{"colWidth":12,"editorMode":"ace/mode/scala","graph":{"mode":"table","height":300,"optionOpen":false,"keys":[],"values":[],"groups":[],"scatter":{}},"enabled":true},"settings":{"params":{},"forms":{}},"jobName":"paragraph_1476736326173_92209436","id":"20161017-201314_978365077","result":{"code":"SUCCESS","type":"TEXT","msg":"import org.zouzias.spark.lucenerdd.facets.FacetedLuceneRDD\nimport org.apache.spark.sql.DataFrame\nfieldName: String = Country\nk: Int = 20\nbenchmark: (citiesDF: org.apache.spark.sql.DataFrame, k: Int)(Long, Long)\n"},"dateCreated":"2016-10-17T20:32:06+0000","dateStarted":"2016-10-17T21:01:59+0000","dateFinished":"2016-10-17T20:56:53+0000","status":"RUNNING","progressUpdateIntervalMs":500,"$$hashKey":"object:1166"},{"text":"val results = List(3, 5).map(x => benchmark(citiesDF,x))\n\n\nresults.foreach{ case (dfTime, lucTime) =>\n println(\"=\" * 20)\n println(s\"DF time: ${ dfTime / 1000D } seconds\")\n println(s\"Lucene time: ${lucTime / 1000D} seconds\")\n println(\"=\" * 20)\n}\n","dateUpdated":"2016-10-17T21:01:57+0000","config":{"colWidth":12,"editorMode":"ace/mode/scala","graph":{"mode":"table","height":300,"optionOpen":false,"keys":[],"values":[],"groups":[],"scatter":{}},"enabled":true},"settings":{"params":{},"forms":{}},"jobName":"paragraph_1476736326173_92209436","id":"20161017-201519_1499660531","result":{"code":"ERROR","type":"TEXT","msg":"org.apache.spark.SparkException: Job 7 cancelled part of cancelled job group zeppelin-20161017-201519_1499660531\n at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1450)\n at org.apache.spark.scheduler.DAGScheduler.handleJobCancellation(DAGScheduler.scala:1389)\n at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleJobGroupCancelled$1.apply$mcVI$sp(DAGScheduler.scala:795)\n at 
org.apache.spark.scheduler.DAGScheduler$$anonfun$handleJobGroupCancelled$1.apply(DAGScheduler.scala:795)\n at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleJobGroupCancelled$1.apply(DAGScheduler.scala:795)\n at scala.collection.mutable.HashSet.foreach(HashSet.scala:78)\n at org.apache.spark.scheduler.DAGScheduler.handleJobGroupCancelled(DAGScheduler.scala:795)\n at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1638)\n at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1618)\n at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1607)\n at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)\n at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:632)\n at org.apache.spark.SparkContext.runJob(SparkContext.scala:1871)\n at org.apache.spark.SparkContext.runJob(SparkContext.scala:1934)\n at org.apache.spark.rdd.RDD$$anonfun$reduce$1.apply(RDD.scala:983)\n at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)\n at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)\n at org.apache.spark.rdd.RDD.withScope(RDD.scala:358)\n at org.apache.spark.rdd.RDD.reduce(RDD.scala:965)\n at org.zouzias.spark.lucenerdd.LuceneRDD.count(LuceneRDD.scala:249)\n at benchmark(<console>:46)\n at $anonfun$1.apply(<console>:37)\n at $anonfun$1.apply(<console>:37)\n at scala.collection.immutable.List.map(List.scala:273)\n ... 
46 elided\n"},"dateCreated":"2016-10-17T20:32:06+0000","dateStarted":"2016-10-17T20:58:35+0000","dateFinished":"2016-10-17T21:00:57+0000","status":"PENDING","progressUpdateIntervalMs":500,"$$hashKey":"object:1167"},{"dateUpdated":"2016-10-17T21:01:57+0000","config":{"colWidth":12,"graph":{"mode":"table","height":300,"optionOpen":false,"keys":[],"values":[],"groups":[],"scatter":{}},"enabled":true,"editorMode":"ace/mode/scala"},"settings":{"params":{},"forms":{}},"jobName":"paragraph_1476737085722_79436625","id":"20161017-204445_262828165","result":{"code":"SUCCESS","type":"TEXT","msg":""},"dateCreated":"2016-10-17T20:44:45+0000","dateStarted":"2016-10-17T20:56:53+0000","dateFinished":"2016-10-17T20:58:29+0000","status":"PENDING","progressUpdateIntervalMs":500,"$$hashKey":"object:1168"}],"name":"Max mind cities","id":"2C1R83USQ","angularObjects":{"2BRWU4WXC:shared_process":[],"2AM1YV5CU:shared_process":[],"2AJXGMUUJ:shared_process":[],"2ANGGHHMQ:shared_process":[],"2AKK3QQXU:shared_process":[]},"config":{"looknfeel":"default"},"info":{}} |
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment.