arthurwuhoo · June 9, 2016 10:18
diff --git a/Day 8 Scraping via API Exercise Solutions.R b/Day 8 Scraping via API Exercise Solutions.R

 #EXERCISE 1
 #Identify the various data types in the following JSON document:
 #----------------------------------------------------------------

 # Before you can read in the JSON, you have to fix it. This was a little unfair
 # to presume, so sorry about that. However, debugging is such a central component
 # to working in R across lots of different datasets. 
 
 # There were a few errors
 # in the JSON file that do not allow it to be read into R correctly. The first one
 # is an error in the GENDER line (there needs to be an extra quote mark) and the
 # second one is in the COUNTRY line, which needs to have a comma after South Africa.
 # 
 # After you fix these errors in a text editor of your choice (TextEdit is the default on Macs),
 # save the file again and read it into R as we do below.


 # Read the hand-corrected sample document and show the R type of every field.
 #
 # rjson is called through its namespace instead of being attached. Both rjson
 # and jsonlite export fromJSON()/toJSON(); attaching rjson after jsonlite makes
 # every later unqualified fromJSON() call resolve to rjson, which breaks the
 # jsonlite-style call (simplifyDataFrame = TRUE) used for Spirits.JSON below.
 library(jsonlite)
 result <- rjson::fromJSON(file = "sample.JSON")
 str(result)


 #----------------------------------------------------------------
 #EXERCISE 8
 #Let’s say we’re interested in analysing the geographic data for all names and locations of 
 #licensed spirit bottlers and producers in the US. We’re interested in finding:

 #1) Which ZIP code has the most licensed vendors per capita?
 #2) Using the Google Maps API, find the states for the top 50 ZIP codes by licensed vendors per capita.

 # You’re given two datasets: a JSON file representing all of the names and locations of licensed
 #spirit bottlers and producers and a CSV file giving US population by ZIP Code.

 #Here’s a hint on how to find the states (without searching every ZIP code or using a ZIP code to 
 #state reference file):
 #----------------------------------------------------------------

 # Reading in this JSON seemed to be super frustrating. I (Arthur) tried several methods
 # across different JSON packages, and settled on using jsonlite because it involved
 # the least amount of work.

 # IMPORTANT - three R packages (rjson, RJSONIO and jsonlite) each export functions named
 # fromJSON and toJSON, but with different arguments and behaviour. That's why a "masking"
 # warning appears when you load one of them after another: the most recently attached
 # package wins. Write package::fromJSON() when you need to be sure which one you get.

 # Load the licensed spirit bottlers/producers file as a data frame.
 #
 # Install jsonlite only when it is missing: re-installing on every run needs
 # network access and does not change which fromJSON() is found on the search
 # path. The call is namespace-qualified so it cannot be masked by another JSON
 # package that happens to be attached.
 if (!requireNamespace("jsonlite", quietly = TRUE)) {
   install.packages("jsonlite")
 }
 library(jsonlite)

 spirits_df <- jsonlite::fromJSON("Spirits.JSON", simplifyDataFrame = TRUE)
 spirits_df # see? a pretty data frame


 #Let's tackle the first question - the most vendors per capita.
 #First, let's find the number of vendors per ZIP code. We'll use the aggregation
 #functions in dplyr to make this easier.

 # Preview the first few vendor records.
 head(spirits_df)

 # Store the ZIP code as text so it can be used as a grouping and join key.
 spirits_df$zipchar <- as.character(spirits_df$ZIP)

 library(dplyr)

 # Number of vendors per ZIP code, largest count first, as a plain data frame.
 count_by_zip <- spirits_df %>%
   group_by(zipchar) %>%
   summarise(count = n()) %>%
   arrange(desc(count)) %>%
   as.data.frame()
 print(count_by_zip)

 head(count_by_zip)
 #this shows that the zip code with the most licensed vendors is 93446
 # 93446 corresponds to Paso Robles, CA
 # The wiki page for Paso Robles shows the following:

 #Located on the Salinas River north of San Luis Obispo, California, the city is known for its hot 
 #springs, its abundance of wineries, production of olive oil, almond orchards, and for playing 
 #host to the California Mid-State Fair.
 # Cool!

 #Now let's get the population for each ZIP code and merge it into this dataset.
 # Load the census population table (one row per ZIP code).
 population <- read.csv("2010-Census-Population-By-Zipcode.csv", stringsAsFactors = FALSE)

 # Inspect both sets of column names, then rename the census columns so the
 # join key ("zipchar") matches the vendor counts.
 colnames(count_by_zip)
 colnames(population)
 colnames(population) <- c("zipchar", "population")

 # Normalise ZIP codes to text with leading zeros restored. read.csv() parses
 # an all-digit column as a number, so "02134" becomes 2134 and would silently
 # fail to match in the join. NA and empty values are left untouched.
 pad_zip <- function(zip) {
   zip <- trimws(as.character(zip))
   too_short <- !is.na(zip) & nzchar(zip) & nchar(zip) < 5
   zip[too_short] <- paste0(strrep("0", 5 - nchar(zip[too_short])), zip[too_short])
   zip
 }
 population$zipchar <- pad_zip(population$zipchar)

 # Apply the same normalisation to a copy of the vendor counts (count_by_zip
 # itself is left as it was) and merge any ZIPs that become identical.
 vendor_counts <- count_by_zip
 vendor_counts$zipchar <- pad_zip(vendor_counts$zipchar)
 vendor_counts <- vendor_counts %>%
   group_by(zipchar) %>%
   summarise(count = sum(count)) %>%
   as.data.frame()

 # Our new dataset: vendor count and population side by side for every ZIP
 # code present in both tables.
 zip_vendor_and_pop <- inner_join(vendor_counts, population, by = "zipchar")

 # Making the per capita variable.
 zip_vendor_and_pop$vendorspercap <- zip_vendor_and_pop$count / zip_vendor_and_pop$population

 # Sorting from the highest to lowest per capita. ZIP codes with zero or
 # missing population give Inf/NA here; they are excluded from the ranking so
 # they cannot crowd out real results.
 rankable <- is.finite(zip_vendor_and_pop$vendorspercap)
 zip_vendor_and_pop_sorted <- zip_vendor_and_pop[rankable, , drop = FALSE] %>%
   arrange(desc(vendorspercap))

 # head() returns at most 50 rows; indexing with [1:50, ] would pad the result
 # with all-NA rows when fewer than 50 ZIP codes are available.
 top50 <- head(zip_vendor_and_pop_sorted, 50) # we're done with part 1.


 ######## FINDING STATES for the top 50.

 # Look up each of the top 50 ZIP codes with the Google geocoder and pull the
 # state out of the formatted address it returns.
 library(ggmap)
 # NOTE(review): recent ggmap releases require a Google API key registered with
 # register_google() before geocode() will work - confirm for the installed version.
 zipdata <- top50$zipchar
 zip_list <- geocode(zipdata, output = "latlona", messaging = TRUE)
 cityandstate <- as.character(zip_list$address)

 # Use a regex to get the states out. Splitting on "," and binding the pieces
 # into a data frame fails as soon as two addresses have a different number of
 # comma-separated parts (or a lookup returned NA). Instead, match the
 # "<2-letter state> <5-digit zip>" fragment directly.
 # Assumes addresses look like "paso robles, ca 93446, usa" - TODO confirm
 # against the actual geocoder output.
 state_zip_pattern <- "\\b[[:alpha:]]{2}[[:space:]]+[[:digit:]]{5}\\b"
 located <- !is.na(cityandstate) & grepl(state_zip_pattern, cityandstate, perl = TRUE)

 # "state zip" fragment per address; NA where nothing could be extracted.
 stateandzip <- rep(NA_character_, length(cityandstate))
 stateandzip[located] <- regmatches(
   cityandstate[located],
   regexpr(state_zip_pattern, cityandstate[located], perl = TRUE)
 )

 # Two-letter state abbreviation for each of the top 50 ZIP codes.
 top50$state <- toupper(substr(stateandzip, 1, 2))

	#EXERCISE 1
	#Identify the various data types in the following JSON document:
	#----------------------------------------------------------------

	# Before you can read in the JSON, you have to fix it. This was a little unfair
	# to presume, so sorry about that. However, debugging is such a central component
	# to working in R across lots of different datasets.

	# There were a few errors
	# in the JSON file that do not allow it to be read into R correctly. The first one
	# is an error in the GENDER line (there needs to be an extra quote mark) and the
	# second one is in the COUNTRY line, which needs to have a comma after South Africa.
	#
	# After you fix these errors in a text editor of your choice (TextEdit is the default on Macs),
	# save the file again and read it into R as we do below.


	# Read the hand-corrected sample document and show the R type of every field.
	#
	# rjson is called through its namespace instead of being attached. Both rjson
	# and jsonlite export fromJSON()/toJSON(); attaching rjson after jsonlite makes
	# every later unqualified fromJSON() call resolve to rjson, which breaks the
	# jsonlite-style call (simplifyDataFrame = TRUE) used for Spirits.JSON below.
	library(jsonlite)
	result <- rjson::fromJSON(file = "sample.JSON")
	str(result)


	#----------------------------------------------------------------
	#EXERCISE 8
	#Let’s say we’re interested in analysing the geographic data for all names and locations of
	#licensed spirit bottlers and producers in the US. We’re interested in finding:

	#1) Which ZIP code has the most licensed vendors per capita?
	#2) Using the Google Maps API, find the states for the top 50 ZIP codes by licensed vendors per capita.

	# You’re given two datasets: a JSON file representing all of the names and locations of licensed
	#spirit bottlers and producers and a CSV file giving US population by ZIP Code.

	#Here’s a hint on how to find the states (without searching every ZIP code or using a ZIP code to
	#state reference file):
	#----------------------------------------------------------------

	# Reading in this JSON seemed to be super frustrating. I (Arthur) tried several methods
	# across different JSON packages, and settled on using jsonlite because it involved
	# the least amount of work.

	# IMPORTANT - three R packages (rjson, RJSONIO and jsonlite) each export functions named
	# fromJSON and toJSON, but with different arguments and behaviour. That's why a "masking"
	# warning appears when you load one of them after another: the most recently attached
	# package wins. Write package::fromJSON() when you need to be sure which one you get.

	# Load the licensed spirit bottlers/producers file as a data frame.
	#
	# Install jsonlite only when it is missing: re-installing on every run needs
	# network access and does not change which fromJSON() is found on the search
	# path. The call is namespace-qualified so it cannot be masked by another JSON
	# package that happens to be attached.
	if (!requireNamespace("jsonlite", quietly = TRUE)) {
	  install.packages("jsonlite")
	}
	library(jsonlite)

	spirits_df <- jsonlite::fromJSON("Spirits.JSON", simplifyDataFrame = TRUE)
	spirits_df # see? a pretty data frame


	#Let's tackle the first question - the most vendors per capita.
	#First, let's find the number of vendors per ZIP code. We'll use the aggregation
	#functions in dplyr to make this easier.

	# Preview the first few vendor records.
	head(spirits_df)

	# Store the ZIP code as text so it can be used as a grouping and join key.
	spirits_df$zipchar <- as.character(spirits_df$ZIP)

	library(dplyr)

	# Number of vendors per ZIP code, largest count first, as a plain data frame.
	count_by_zip <- spirits_df %>%
	  group_by(zipchar) %>%
	  summarise(count = n()) %>%
	  arrange(desc(count)) %>%
	  as.data.frame()
	print(count_by_zip)

	head(count_by_zip)
	#this shows that the zip code with the most licensed vendors is 93446
	# 93446 corresponds to Paso Robles, CA
	# The wiki page for Paso Robles shows the following:

	#Located on the Salinas River north of San Luis Obispo, California, the city is known for its hot
	#springs, its abundance of wineries, production of olive oil, almond orchards, and for playing
	#host to the California Mid-State Fair.
	# Cool!

	#Now let's get the population for each ZIP code and merge it into this dataset.
	# Load the census population table (one row per ZIP code).
	population <- read.csv("2010-Census-Population-By-Zipcode.csv", stringsAsFactors = FALSE)

	# Inspect both sets of column names, then rename the census columns so the
	# join key ("zipchar") matches the vendor counts.
	colnames(count_by_zip)
	colnames(population)
	colnames(population) <- c("zipchar", "population")

	# Normalise ZIP codes to text with leading zeros restored. read.csv() parses
	# an all-digit column as a number, so "02134" becomes 2134 and would silently
	# fail to match in the join. NA and empty values are left untouched.
	pad_zip <- function(zip) {
	  zip <- trimws(as.character(zip))
	  too_short <- !is.na(zip) & nzchar(zip) & nchar(zip) < 5
	  zip[too_short] <- paste0(strrep("0", 5 - nchar(zip[too_short])), zip[too_short])
	  zip
	}
	population$zipchar <- pad_zip(population$zipchar)

	# Apply the same normalisation to a copy of the vendor counts (count_by_zip
	# itself is left as it was) and merge any ZIPs that become identical.
	vendor_counts <- count_by_zip
	vendor_counts$zipchar <- pad_zip(vendor_counts$zipchar)
	vendor_counts <- vendor_counts %>%
	  group_by(zipchar) %>%
	  summarise(count = sum(count)) %>%
	  as.data.frame()

	# Our new dataset: vendor count and population side by side for every ZIP
	# code present in both tables.
	zip_vendor_and_pop <- inner_join(vendor_counts, population, by = "zipchar")

	# Making the per capita variable.
	zip_vendor_and_pop$vendorspercap <- zip_vendor_and_pop$count / zip_vendor_and_pop$population

	# Sorting from the highest to lowest per capita. ZIP codes with zero or
	# missing population give Inf/NA here; they are excluded from the ranking so
	# they cannot crowd out real results.
	rankable <- is.finite(zip_vendor_and_pop$vendorspercap)
	zip_vendor_and_pop_sorted <- zip_vendor_and_pop[rankable, , drop = FALSE] %>%
	  arrange(desc(vendorspercap))

	# head() returns at most 50 rows; indexing with [1:50, ] would pad the result
	# with all-NA rows when fewer than 50 ZIP codes are available.
	top50 <- head(zip_vendor_and_pop_sorted, 50) # we're done with part 1.


	######## FINDING STATES for the top 50.

	# Look up each of the top 50 ZIP codes with the Google geocoder and pull the
	# state out of the formatted address it returns.
	library(ggmap)
	# NOTE(review): recent ggmap releases require a Google API key registered with
	# register_google() before geocode() will work - confirm for the installed version.
	zipdata <- top50$zipchar
	zip_list <- geocode(zipdata, output = "latlona", messaging = TRUE)
	cityandstate <- as.character(zip_list$address)

	# Use a regex to get the states out. Splitting on "," and binding the pieces
	# into a data frame fails as soon as two addresses have a different number of
	# comma-separated parts (or a lookup returned NA). Instead, match the
	# "<2-letter state> <5-digit zip>" fragment directly.
	# Assumes addresses look like "paso robles, ca 93446, usa" - TODO confirm
	# against the actual geocoder output.
	state_zip_pattern <- "\\b[[:alpha:]]{2}[[:space:]]+[[:digit:]]{5}\\b"
	located <- !is.na(cityandstate) & grepl(state_zip_pattern, cityandstate, perl = TRUE)

	# "state zip" fragment per address; NA where nothing could be extracted.
	stateandzip <- rep(NA_character_, length(cityandstate))
	stateandzip[located] <- regmatches(
	  cityandstate[located],
	  regexpr(state_zip_pattern, cityandstate[located], perl = TRUE)
	)

	# Two-letter state abbreviation for each of the top 50 ZIP codes.
	top50$state <- toupper(substr(stateandzip, 1, 2))