Last active
February 20, 2017 04:56
-
-
Save actuaryactually/a24304a074722ae5ac3452d2cef62c4a to your computer and use it in GitHub Desktop.
Quake Visualisations
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# PRELIMINARIES: ----
# Step 1: scrape location names and addresses for Harvey Norman Electrical
# stores from the store-finder page.
install.packages("pacman")
pacman::p_load(XML, rvest, magrittr, RCurl, stringi)

# Draw in address locations from the website:
url <- "http://www.harveynorman.co.nz/store-finder.html"
rvest_target <- read_html(url)
# The CSS selector "p.address " was identified with the SelectorGadget tool:
# https://cran.r-project.org/web/packages/rvest/vignettes/selectorgadget.html
rvest_table_nodes <- html_nodes(rvest_target, "p.address ")
addresses <- html_text(rvest_table_nodes, trim = TRUE)

# Remove nearly all HTML carriage returns and formatting:
addresses <- strsplit(addresses, " \r\n\t")

# Write output to a one-column matrix in which "View Map" and extra spaces are
# removed. (The original called paste0(..., sep = ", "); paste0 has no sep
# argument, so that string was silently pasted onto each element -- the
# explicit ", " below does the same thing on purpose.)
size.reqd <- length(addresses)
output.addresses <- matrix(nrow = size.reqd, ncol = 1)
for (i in seq_along(addresses)) {
  output.addresses[i] <- unlist(
    strsplit(paste0(unlist(addresses[[i]]), ", ", collapse = ""), "View Map,")
  )
}
print(output.addresses)

# Scan output for errors, then manually correct. NB rows 2 and 37 need fixing.
# Row 2 first:
temp <- paste0(unlist(strsplit(output.addresses[2], "\r\n\t")), ", ", collapse = "")
nchar(temp)  # 66 characters long, so retain all but the last two to purge the extra comma
output.addresses[2] <- substr(temp, start = 1, stop = 64)
rm(temp)

# Then correct row 37:
temp <- paste0(unlist(strsplit(output.addresses[37], "\r\n")), ", ", collapse = "")
nchar(temp)  # check the length; keep only the first 46 characters to drop trailing junk
output.addresses[37] <- substr(temp, start = 1, stop = 46)
rm(temp)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# PRELIMINARIES: ----
pacman::p_load(XML)

# Helper: query the Google Maps geocoding XML API for one address and return
# a named character vector c(long = ..., lat = ...). The caller converts to
# numeric. Extracted because the original script repeated this 6-line fetch
# four times verbatim.
# NOTE(review): this keyless endpoint is legacy; Google now requires an API
# key -- confirm the URL still responds before re-running.
geocode_address <- function(addr) {
  url <- paste0("http://maps.google.com/maps/api/geocode/xml?address=", addr, "&sensor=false")
  root <- xmlRoot(xmlTreeParse(url))
  location <- root[["result"]][["geometry"]][["location"]]
  c(long = xmlValue(location[["lng"]]), lat = xmlValue(location[["lat"]]))
}

# Recode the scraped addresses as a data frame with placeholder long/lat columns:
output.addresses.df <- data.frame(c(output.addresses))
output.addresses.df <- cbind(output.addresses.df, long = 1, lat = 2)

# Assign long/lat for each store address:
for (i in seq_along(output.addresses)) {
  coords <- geocode_address(output.addresses[i])
  output.addresses.df[i, 2] <- coords["long"]
  output.addresses.df[i, 3] <- coords["lat"]
}
print(output.addresses.df)

# Rows 35 and 38 have problems - 35 returns long/lats that are way off, and 38
# throws an error. Manually searching Google Maps shows each address needs a
# slight change; overwriting with the corrected address solves the problem.
coords <- geocode_address(" Cnr West Street & Moore Street, Ashburton")
output.addresses.df[35, 2] <- coords["long"]
output.addresses.df[35, 3] <- coords["lat"]

coords <- geocode_address("Cnr Maclaggan Street & Rattray Street, Dunedin")
output.addresses.df[38, 2] <- coords["long"]
output.addresses.df[38, 3] <- coords["lat"]

# All locations assigned now:
print(output.addresses.df)

# Name the columns and format long/lat as numbers (needed later for plotting):
colnames(output.addresses.df) <- c("Address", "long", "lat")
output.addresses.df$Address <- as.character(output.addresses.df$Address)
output.addresses.df$long <- as.numeric(output.addresses.df$long)
output.addresses.df$lat <- as.numeric(output.addresses.df$lat)

# Major towns near the quake zone, geocoded the same way:
town.names <- c("Wellington, New Zealand",
                "Picton, New Zealand",
                "Nelson, New Zealand",
                "Blenheim, New Zealand")
major.towns.df <- data.frame(town.names)
major.towns.df <- cbind(major.towns.df, long = 1, lat = 2)
for (i in seq_along(town.names)) {
  coords <- geocode_address(town.names[i])
  major.towns.df[i, 2] <- coords["long"]
  major.towns.df[i, 3] <- coords["lat"]
}
print(major.towns.df)

# Replace the full look-up strings with short display labels and make the
# coordinates numeric:
town.names.short <- c("Wellington",
                      "Picton",
                      "Nelson",
                      "Blenheim")
major.towns.df$town.names <- town.names.short
major.towns.df$long <- as.numeric(major.towns.df$long)
major.towns.df$lat <- as.numeric(major.towns.df$lat)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# PRELIMINARIES: ----
pacman::p_load(data.table)

# The following site contains historic earthquake data for New Zealand:
# http://quakesearch.geonet.org.nz/
# Using the "build query" option on that page we can look up quakes in specific
# regions and time frames. Here we pull Wellington's seismic profile; the site
# splits the result into three links covering non-overlapping date ranges.
# NOTE(review): an earlier comment said "since 1 Jan 2006" but the URLs below
# start at 2011 -- confirm the intended window.
url1 <- 'http://quakesearch.geonet.org.nz/csv?region=wellington&minmag=1&startdate=2014-05-01&enddate=2017-1-30T17:00:00'
url2 <- 'http://quakesearch.geonet.org.nz/csv?region=wellington&minmag=1&startdate=2011-07-01&enddate=2014-05-01'
url3 <- 'http://quakesearch.geonet.org.nz/csv?region=wellington&minmag=1&startdate=2011-01-01T16:00:00&enddate=2011-07-01'
# Bug fix: the original downloaded url1 twice and called head(mydat) before
# mydat existed; the duplicate fetch and premature head() are removed.
mydat1 <- fread(url1)
mydat2 <- fread(url2)
mydat3 <- fread(url3)
mydat <- rbind(mydat1, mydat2, mydat3)

# Review the combined download, then tidy up the intermediates:
head(mydat)
rm(mydat1, mydat2, mydat3)

# Rename the longitude/latitude columns to match the address data frames:
colnames(mydat)[5] <- "long"
colnames(mydat)[6] <- "lat"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Load the mapping/plotting packages (pacman was installed in the first script):
pacman::p_load(ggmap, fitdistrplus, maps, mapproj, ggrepel)

# Centre a black-and-white basemap on Wellington:
wellington <- c(lon = 174.7762, lat = -41.28646)
wellington.map <- get_map(location = wellington, zoom = 8, color = "bw")

# Overlay quake density contours on the basemap, then mark the store locations
# as blue squares. Built as one layer chain rather than repeated reassignment.
r <- ggmap(wellington.map, extent = "panel", maprange = FALSE) %+%
  mydat +
  aes(x = long, y = lat) +
  geom_density2d() +
  stat_density2d(aes(fill = ..level.., alpha = ..level..),
                 size = 0.1, bins = 100, geom = 'polygon') +
  scale_fill_gradient(low = "green", high = "red") +
  scale_alpha(range = c(0.00, 0.25), guide = FALSE) +
  geom_point(data = output.addresses.df, aes(x = long, y = lat),
             color = "blue", shape = 15) +
  labs(
    title = "NZ EQ hotspots 1 Jan 2006 - Present Vs Exposure: (i.e. location of Harvey Norman Store Outlets)",
    subtitle = "The Blenheim store appears to be closest to an active zone",
    caption = "Sources: http://info.geonet.org.nz/ and http://www.harveynorman.co.nz/store-finder.html"
  )
r
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Load the analysis packages. Note: you need to have already run the prior
# 4 scripts so that mydat etc. are in memory.
pacman::p_load(fitdistrplus, reshape2, grid, gridExtra)

# Severity over time:
s <- ggplot(data = mydat, aes(x = as.Date(origintime), y = magnitude))
s <- s + geom_point(alpha = 1/25, color = "coral1", shape = 16) + geom_smooth(span = 0.3, se = TRUE)
s <- s + labs(
  title = "Over Time...",
  subtitle = "One circle per quake. 25 quakes to produce a solid pink circle. Blue line shows smoothed average (GAM model)",
  caption = "Source: http://info.geonet.org.nz/"
)
s <- s + xlab("") + ylab("Magnitude")
s <- s + theme_minimal()

# Severity histogram - note: experiment with the binning to see the shape clearly.
# Bug fix: the original added geom_histogram() twice, so a second histogram with
# default binning was drawn on top of the binwidth = 0.1 layer; a single layer
# now carries both the binwidth and the count-gradient fill.
t <- ggplot(data = mydat, aes(x = magnitude))
t <- t + geom_histogram(aes(fill = ..count..), binwidth = 0.1) +
  scale_fill_gradient("Magnitude", low = "green", high = "red")
t <- t + labs(
  title = "Observed variability",
  caption = "Source: http://info.geonet.org.nz/"
)
t <- t + theme_minimal()
t <- t + xlab("Magnitude") + ylab("Count")

# Try fitting the severity with candidate parametric distributions:
fg <- fitdist(mydat$magnitude, "gamma")
fln <- fitdist(mydat$magnitude, "lnorm")
fw <- fitdist(mydat$magnitude, "weibull")

# Compare the fits visually (density, QQ, CDF, PP) in a 2x2 panel.
# Source: https://cran.r-project.org/web/packages/fitdistrplus/vignettes/paper2JSS.pdf
par(mfrow = c(2, 2))
plot.legend <- c("Weibull", "lognormal", "gamma")
denscomp(list(fw, fln, fg), legendtext = plot.legend)
qqcomp(list(fw, fln, fg), legendtext = plot.legend)
cdfcomp(list(fw, fln, fg), legendtext = plot.legend)
ppcomp(list(fw, fln, fg), legendtext = plot.legend)

# Check goodness-of-fit using AIC...
fln$aic
fg$aic
fw$aic
# Lognormal is lowest, which agrees with the visual QQ plot.

# Compare simulated vs observed magnitudes:
par(mfrow = c(1, 1))
set.seed(3)
sim.magnitudes <- rlnorm(10000, meanlog = fln$estimate[1], sdlog = fln$estimate[2])
x <- data.frame(actual = mydat$magnitude,
                simulated = sim.magnitudes[seq_along(mydat$magnitude)])
quake.df <- melt(x)  # reshape from wide to long format
u <- ggplot(data = quake.df, aes(x = value, fill = variable)) + geom_density(alpha = 0.25)
u <- u + labs(
  title = "Observed vs Simulated LgN",
  caption = "Source: http://info.geonet.org.nz/"
)
u <- u + theme_minimal()
u <- u + xlab("Magnitude") + ylab("Density")

# Stack the three panels:
grid.arrange(s, t, u)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment