Last active
June 16, 2016 14:02
-
-
Save TomLous/c7a3134969c64b9d8e2dc1d2a6426b7c to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| val conf = new SparkConf().setAppName("SparkTest").setMaster("local[1]") | |
| val sparkContext = new SparkContext(conf) | |
| val sqlContext = new SQLContext(sparkContext) | |
| import sqlContext.implicits.{getClass => _, _} | |
| val fileNameGoogle = "/google-places-data.json" // 1 json places per line | |
| val locationsGoogle = sqlContext.read.json(getClass.getResource(fileNameGoogle).getPath). | |
| toDF(). | |
| withColumn("address_components", explode($"address_components")). | |
| select( | |
| $"address_components.long_name", | |
| $"address_components.short_name", | |
| $"address_components.types".getItem(0).alias("type"), | |
| $"geometry.location.lat", | |
| $"geometry.location.lng", | |
| $"place_id".alias("id") | |
| ). | |
| groupBy("id","lat","lng"). | |
| pivot("type"). | |
| agg(first("long_name"), first("short_name")). | |
| select( | |
| $"id".alias("googleId"), | |
| $"street_number_first(long_name)()".alias("houseNumber"), | |
| $"route_first(long_name)()".alias("street"), | |
| $"sublocality_level_1_first(long_name)()".alias("area"), | |
| $"sublocality_level_2_first(long_name)()".alias("municipality"), | |
| $"locality_first(long_name)()".alias("city"), | |
| $"administrative_area_level_2_first(long_name)()".alias("nuts3"), | |
| $"neighborhood_first(long_name)()".alias("neighbourhood"), | |
| $"administrative_area_level_1_first(long_name)()".alias("nuts2"), | |
| $"administrative_area_level_1_first(short_name)()".alias("nuts2Code"), | |
| $"country_first(short_name)()".alias("countryCode"), | |
| $"country_first(long_name)()".alias("country"), | |
| $"postal_code_first(long_name)()".alias("postalCode"), | |
| $"postal_code_prefix_first(long_name)()".alias("postalCodePrefix"), | |
| $"lat".alias("latitude"), | |
| $"lng".alias("longitude") | |
| ). | |
| show() |
Author
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Probably not the most efficient way, but semi-readable, cotains many learnings about sparkSQL and it actually works in spark 1.6.1