Last active
June 16, 2016 14:02
-
-
Save TomLous/c7a3134969c64b9d8e2dc1d2a6426b7c to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
val conf = new SparkConf().setAppName("SparkTest").setMaster("local[1]") | |
val sparkContext = new SparkContext(conf) | |
val sqlContext = new SQLContext(sparkContext) | |
import sqlContext.implicits.{getClass => _, _} | |
val fileNameGoogle = "/google-places-data.json" // 1 json places per line | |
val locationsGoogle = sqlContext.read.json(getClass.getResource(fileNameGoogle).getPath). | |
toDF(). | |
withColumn("address_components", explode($"address_components")). | |
select( | |
$"address_components.long_name", | |
$"address_components.short_name", | |
$"address_components.types".getItem(0).alias("type"), | |
$"geometry.location.lat", | |
$"geometry.location.lng", | |
$"place_id".alias("id") | |
). | |
groupBy("id","lat","lng"). | |
pivot("type"). | |
agg(first("long_name"), first("short_name")). | |
select( | |
$"id".alias("googleId"), | |
$"street_number_first(long_name)()".alias("houseNumber"), | |
$"route_first(long_name)()".alias("street"), | |
$"sublocality_level_1_first(long_name)()".alias("area"), | |
$"sublocality_level_2_first(long_name)()".alias("municipality"), | |
$"locality_first(long_name)()".alias("city"), | |
$"administrative_area_level_2_first(long_name)()".alias("nuts3"), | |
$"neighborhood_first(long_name)()".alias("neighbourhood"), | |
$"administrative_area_level_1_first(long_name)()".alias("nuts2"), | |
$"administrative_area_level_1_first(short_name)()".alias("nuts2Code"), | |
$"country_first(short_name)()".alias("countryCode"), | |
$"country_first(long_name)()".alias("country"), | |
$"postal_code_first(long_name)()".alias("postalCode"), | |
$"postal_code_prefix_first(long_name)()".alias("postalCodePrefix"), | |
$"lat".alias("latitude"), | |
$"lng".alias("longitude") | |
). | |
show() |
Probably not the most efficient way, but semi-readable, cotains many learnings about sparkSQL and it actually works in spark 1.6.1
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Converts the complex address_components structure in google places data into DataFrame table
{"address_components" : [ { "long_name" : "32", "short_name" : "32", "types" : [ "street_number" ] }, { "long_name" : "Parkstraat", "short_name" : "Parkstraat", "types" : [ "route" ] }, { "long_name" : "Centrum", "short_name" : "Centrum", "types" : [ "sublocality_level_1", "sublocality", "political" ] }, { "long_name" : "Den Haag", "short_name" : "Den Haag", "types" : [ "locality", "political" ] }, { "long_name" : "Den Haag", "short_name" : "Den Haag", "types" : [ "administrative_area_level_2", "political" ] }, { "long_name" : "Zuid-Holland", "short_name" : "ZH", "types" : [ "administrative_area_level_1", "political" ] }, { "long_name" : "Netherlands", "short_name" : "NL", "types" : [ "country", "political" ] }, { "long_name" : "2514 JK", "short_name" : "2514 JK", "types" : [ "postal_code" ] } ], ... }
+--------------------+-----------+--------------------+-----------+------------+--------+--------+-------------+------------+---------+-----------+-----------+----------+----------------+-----------------+-----------------+
| googleId|houseNumber| street| area|municipality| city| nuts3|neighbourhood| nuts2|nuts2Code|countryCode| country|postalCode|postalCodePrefix| latitude| longitude|
+--------------------+-----------+--------------------+-----------+------------+--------+--------+-------------+------------+---------+-----------+-----------+----------+----------------+-----------------+-----------------+
|ChIJ5SkA8iK3xUcRZ...| 32| Parkstraat| Centrum| null|Den Haag|Den Haag| null|Zuid-Holland| ZH| NL|Netherlands| 2514 JK| null| 52.0839421| 4.3069938|