Skip to content

Instantly share code, notes, and snippets.

@zouzias
Last active October 17, 2016 21:06
Show Gist options
  • Save zouzias/ae4fb6a6d349f3aa30021e07c6466cd8 to your computer and use it in GitHub Desktop.
MaxMind cities example
{"paragraphs":[{"text":"// Add this in the interpreter\n// %dep\n// z.addRepo(\"Spark Packages Repo\").url(\"http://dl.bintray.com/spark-packages/maven\")\n// z.load(\"org.zouzias:spark-lucenerdd_2.11:0.2.1\")\n\nval citiesDF = sqlContext.read.parquet(\"s3://recordlinkage/world-cities-maxmind.parquet\")\ncitiesDF.cache\nval total = citiesDF.count\n\nprintln(s\"Cities: ${total}\")","dateUpdated":"2016-10-17T21:01:57+0000","config":{"colWidth":12,"editorMode":"ace/mode/scala","graph":{"mode":"table","height":300,"optionOpen":false,"keys":[],"values":[],"groups":[],"scatter":{}},"enabled":true},"settings":{"params":{},"forms":{}},"jobName":"paragraph_1476736326171_94517929","id":"20161017-201243_2050124200","result":{"code":"SUCCESS","type":"TEXT","msg":"citiesDF: org.apache.spark.sql.DataFrame = [Country: string, City: string ... 5 more fields]\nres1: citiesDF.type = [Country: string, City: string ... 5 more fields]\ntotal: Long = 3173958\nCities: 3173958\n"},"dateCreated":"2016-10-17T20:32:06+0000","dateStarted":"2016-10-17T21:01:57+0000","dateFinished":"2016-10-17T21:02:53+0000","status":"FINISHED","progressUpdateIntervalMs":500,"$$hashKey":"object:1165"},{"text":"import org.zouzias.spark.lucenerdd.facets.FacetedLuceneRDD\nimport org.apache.spark.sql.DataFrame\n\nval fieldName = \"Country\"\nval k: Int = 20\n\ncitiesDF.cache()\ncitiesDF.count()\nval luceneRDD = FacetedLuceneRDD(citiesDF)\nluceneRDD.cache()\nluceneRDD.count()\n \ndef benchmark(citiesDF: DataFrame, luceneRDD: FacetedLuceneRDD, k: Int): (Long, Long) = {\n // Run DataFrame groupBy count and sort\n \n val dfStart = System.currentTimeMillis()\n val dfResults = citiesDF.groupBy(fieldName).count().sort(desc(\"count\")).take(k)\n val dfEnd = System.currentTimeMillis()\n \n // Run Faceted search\n val lucStart =System.currentTimeMillis()\n luceneRDD.facetQuery(\"*:*\", fieldName, facetNum = k)\n val lucEnd =System.currentTimeMillis()\n \n (dfEnd - dfStart, lucEnd - 
lucStart)\n}","dateUpdated":"2016-10-17T21:01:57+0000","config":{"colWidth":12,"editorMode":"ace/mode/scala","graph":{"mode":"table","height":300,"optionOpen":false,"keys":[],"values":[],"groups":[],"scatter":{}},"enabled":true},"settings":{"params":{},"forms":{}},"jobName":"paragraph_1476736326173_92209436","id":"20161017-201314_978365077","result":{"code":"SUCCESS","type":"TEXT","msg":"import org.zouzias.spark.lucenerdd.facets.FacetedLuceneRDD\nimport org.apache.spark.sql.DataFrame\nfieldName: String = Country\nk: Int = 20\nbenchmark: (citiesDF: org.apache.spark.sql.DataFrame, k: Int)(Long, Long)\n"},"dateCreated":"2016-10-17T20:32:06+0000","dateStarted":"2016-10-17T21:01:59+0000","dateFinished":"2016-10-17T20:56:53+0000","status":"RUNNING","progressUpdateIntervalMs":500,"$$hashKey":"object:1166"},{"text":"val results = List(3, 5).map(x => benchmark(citiesDF,x))\n\n\nresults.foreach{ case (dfTime, lucTime) =>\n println(\"=\" * 20)\n println(s\"DF time: ${ dfTime / 1000D } seconds\")\n println(s\"Lucene time: ${lucTime / 1000D} seconds\")\n println(\"=\" * 20)\n}\n","dateUpdated":"2016-10-17T21:01:57+0000","config":{"colWidth":12,"editorMode":"ace/mode/scala","graph":{"mode":"table","height":300,"optionOpen":false,"keys":[],"values":[],"groups":[],"scatter":{}},"enabled":true},"settings":{"params":{},"forms":{}},"jobName":"paragraph_1476736326173_92209436","id":"20161017-201519_1499660531","result":{"code":"ERROR","type":"TEXT","msg":"org.apache.spark.SparkException: Job 7 cancelled part of cancelled job group zeppelin-20161017-201519_1499660531\n at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1450)\n at org.apache.spark.scheduler.DAGScheduler.handleJobCancellation(DAGScheduler.scala:1389)\n at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleJobGroupCancelled$1.apply$mcVI$sp(DAGScheduler.scala:795)\n at 
org.apache.spark.scheduler.DAGScheduler$$anonfun$handleJobGroupCancelled$1.apply(DAGScheduler.scala:795)\n at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleJobGroupCancelled$1.apply(DAGScheduler.scala:795)\n at scala.collection.mutable.HashSet.foreach(HashSet.scala:78)\n at org.apache.spark.scheduler.DAGScheduler.handleJobGroupCancelled(DAGScheduler.scala:795)\n at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1638)\n at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1618)\n at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1607)\n at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)\n at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:632)\n at org.apache.spark.SparkContext.runJob(SparkContext.scala:1871)\n at org.apache.spark.SparkContext.runJob(SparkContext.scala:1934)\n at org.apache.spark.rdd.RDD$$anonfun$reduce$1.apply(RDD.scala:983)\n at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)\n at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)\n at org.apache.spark.rdd.RDD.withScope(RDD.scala:358)\n at org.apache.spark.rdd.RDD.reduce(RDD.scala:965)\n at org.zouzias.spark.lucenerdd.LuceneRDD.count(LuceneRDD.scala:249)\n at benchmark(<console>:46)\n at $anonfun$1.apply(<console>:37)\n at $anonfun$1.apply(<console>:37)\n at scala.collection.immutable.List.map(List.scala:273)\n ... 
46 elided\n"},"dateCreated":"2016-10-17T20:32:06+0000","dateStarted":"2016-10-17T20:58:35+0000","dateFinished":"2016-10-17T21:00:57+0000","status":"PENDING","progressUpdateIntervalMs":500,"$$hashKey":"object:1167"},{"dateUpdated":"2016-10-17T21:01:57+0000","config":{"colWidth":12,"graph":{"mode":"table","height":300,"optionOpen":false,"keys":[],"values":[],"groups":[],"scatter":{}},"enabled":true,"editorMode":"ace/mode/scala"},"settings":{"params":{},"forms":{}},"jobName":"paragraph_1476737085722_79436625","id":"20161017-204445_262828165","result":{"code":"SUCCESS","type":"TEXT","msg":""},"dateCreated":"2016-10-17T20:44:45+0000","dateStarted":"2016-10-17T20:56:53+0000","dateFinished":"2016-10-17T20:58:29+0000","status":"PENDING","progressUpdateIntervalMs":500,"$$hashKey":"object:1168"}],"name":"Max mind cities","id":"2C1R83USQ","angularObjects":{"2BRWU4WXC:shared_process":[],"2AM1YV5CU:shared_process":[],"2AJXGMUUJ:shared_process":[],"2ANGGHHMQ:shared_process":[],"2AKK3QQXU:shared_process":[]},"config":{"looknfeel":"default"},"info":{}}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment