johnDorian · September 22, 2010 00:45
diff --git a/code_Burke_TP.R b/code_Burke_TP.R
 #####################################################################################################
 ### Aim: To test the significance of randomly allocating flow to 'midnight' water quality samples.
 ### Date Created: Thursday 2nd September 2010
 ### Author: Jason Lessels
 ### Packages required: TSAgg_0.2-1,geoR 
 ### Notes: The script can easily be modified for the other water quality parameters, and the amount of simulations. Things to check:
 ###Both WQ and discharge must have the same initial time stamp (hours) before aggregation.
 ###WQ variable modify lines 52,60,146,186
 ###line 99 changes the amount of simulations to run.
 ###line 156 determines when the bushfire occurred.
 ###lines 200:220 need to specify the names of the likfit models.
 ###Aggregation, 77,40, and the switch statements in 110:130 - is explained.
 #####################################################################################################



 ###Load the TSAgg package.
 install.packages("geoR")
 install.packages("TSAgg")
 library(TSAgg)
 ###Set the working dir.
 setwd("E:/4th YEAR PROJECT/data/Burke")
 setwd("~/Documents/code/francesca/")
 ###load the quality data
 quality<-read.csv("Burke_worked.csv",header=T)
 ###What is the format of the dates?
 head(quality)
 tail(quality)


 ###Load the flow data - takes awhile
 flow<-read.table("2122052.TXT",header=T,sep=",")[,-3]
 ###Fix the names up
 names(flow)<-c("date","discharge")
 #get rid of missing values
 flow<-flow[!is.na(flow[,2]),]
 ##No bug But everything is easier to work with when midnight is the first observation.
 flow<-flow[-c(1:8),]
 ##Check out that all is well
 head(flow)
 ###Convert the flow data into a timeSeries formatted data frame
 flow<-timeSeries(flow$date,"%d/%m/%Y %H:%M",data=flow$discharge)
 ###The aggregation can take some time for long datasets like  this one.
 flow.3hr<-hoursAgg(flow,sum,3)
 ####Make sure all is well
 head(flow.3hr)
 tail(flow.3hr)
 ##Change the names of the aggregated data frame.
 names(flow.3hr)<-c("date","discharge")

 ###format the time stamps
 q2<-timeSeries(quality$date,"%d/%m/%Y %H:%M")
 ###Add the quality values to the new timeSeries object, as the timeSeries function dosen't allow for factors (Sample.Type)
 q2<-data.frame(q2,quality[,3:7])
 ###Get rid of any missing TP values, as this is what we are interested in.
 q2<-q2[!is.na(q2$Phosphorus.Total..mg.L.),]
 ###Subset all samples that are collected between midnight and 12.59.59
 temp<-q2[q2[,"hour"]==0,]
 ###Remove all automatically collected samples, as it is highly possible for an automatic sample to have been collected at this time.
 midnight.wq<-temp[temp[,"Sample.Type"]!="A",]
 ###Create a dataset that is the opposite of the above so the two can be joined after the random samples have been produced. This means that all samples not collected at midnight, or they have been collected using an Automatic sampler.
 other.wq<-q2[q2[,"hour"]!=0|q2[,"Sample.Type"]=="A",]
 ###Grab the date and the TP column
 other.wq<-other.wq[,c(1,12)]
 head(other.wq)
 tail(other.wq)

 #########################################################
 ###Test to see if the above separation worked properly
 ###Check that all is well, by checking the lengths of the two subsetted data frames and comparing to that of the quality tp object (q2)
 length(midnight.wq[,1])
 length(other.wq[,1])
 ##Is the length of all quality samples the same as the length of the problem and non problem samples.
 length(q2[,1])==(length(other.wq[,1])+length(midnight.wq[,1]))
 #########################################################


 ###Merge the flow with the water quality that dosent have midnight observations.
 ###Sort the dates so they are in chronological order.
 other.wq<-other.wq[order(other.wq[,1]),]
 ###Format the dates so we can change the hour for merging with flow
 qual<-timeSeries(other.wq$dates,"%Y-%m-%d %H:%M:%S")
 ###Get the aggregated hourly block that the quality observation falls into in respect to the hourly aggregation of the flow.
 temphour<-floor(qual$hour/3)*3
 ###Create a temp time stamp for merging with the flow - this will be changed back after the merging takes place
 temptime<-strptime(paste(qual$day,qual$month,qual$year,temphour,0),"%d %m %Y %H %M",tz="GMT")
 #Create a temp qual object for th merging with the new aggregated dates.
 tempqual<-data.frame(date=temptime,tp=other.wq[,2])
 ###Merge the quality with flow
 temp<-merge(flow.3hr,tempqual,by="date",all.y=T)
 #Now change the dates back to the original time stamp of the quality observation
 temp$date<-qual$date
 ###Now get rid of the missing flow values.
 other.wq<-temp[!is.na(temp[,2]),]




 #############################################################################################################
 ##########Below randomly allocates a discharge value from the aggregated flow data, between 8am and 5pm.#####
 #############################################################################################################

 ###This sets how many realisations you want this can be changed to anything you want...
 simulations<-100
 ###This creates a matrix for the randomly sampled flow
 new.data<-matrix(ncol=simulations,nrow=length(midnight.wq[,1]))
 ###Create an object for the dates that the sampled flow represents.
 dates<-matrix(ncol=simulations,nrow=length(midnight.wq[,1]))
 ##Cycle through each of the midnight samples 
 for(i in 1:length(midnight.wq[,1])){
 	#Check to see if flow is recorded on this day and only continue if it is.
 	if(length(flow.3hr[(flow.3hr[,1]==as.POSIXct(as.Date(midnight.wq[i,1]))),1])==1){
 		#Get the row number in the larger flow dataset, for midnight.
 		day.id<-as.numeric(rownames(flow.3hr[(flow.3hr[,1]==as.POSIXct(as.Date(midnight.wq[i,1]))),]))
 		#Random sample which hourly aggregated block to use, do this for the length of simulations. 
 		#|1|2,2,2|3,3,3|4,4| = |8am|9am,10am,11am|12pm,1pm,2pm|3pm,4pm| - therefore each hour has equal probabilty, but the blocks arent of equal probablity.
 		sampled<-sample(c(1,2,2,2,3,3,3,4,4),simulations,replace=T)
 		#Now cycle through the cols of the current midnight sample (i) and sample based on the sampled value for this col.
 		for(j in 1:simulations){
 ###Use a switch statement. There nice and quick and the one slected is based on the sampled value of this col.
 			#First switch statment gets the randomly allocated flow value.			
 			switch(sampled[j],
 				new.data[i,j]<-flow.3hr[(day.id+2),2],
 				new.data[i,j]<-flow.3hr[(day.id+3),2],
 				new.data[i,j]<-flow.3hr[(day.id+4),2],
 				new.data[i,j]<-flow.3hr[(day.id+5),2]
 			)
 			#Second switch statement gets the time of the randomly allocated flow.
 			switch(sampled[j],
 				dates[i,j]<-paste(flow.3hr[(day.id+2),1]),
 				dates[i,j]<-paste(flow.3hr[(day.id+3),1]),
 				dates[i,j]<-paste(flow.3hr[(day.id+4),1]),	
 				dates[i,j]<-paste(flow.3hr[(day.id+5),1])
 			)
 		}
 	}
 }



 ############Now merge the newly randomly created value with the good values.

 ################################
 ### Put it all back together ###
 ################################


 ##Create a list for to store the simulated data for each realisation
 all.output<-list()
 for(i in 1:simulations){
 	#Create the dataset for this realisation
 	sim.<-data.frame(date=dates[,i],discharge=new.data[,i],tp=midnight.wq$Phosphorus.Total..mg.L.)
 	#Get rid of the missing values
 	sim.<-sim.[!is.na(sim.[,2]),]
 	
 	#join this realisation with the unchanged observations
 	together<-rbind(other.wq,sim.)
 	#Now get the dates in order.
 	together<-together[order(together[,1]),]
 	together$numericdist<-as.numeric(difftime(together[,1],together[1,1],units="days"))
 	together$bush.fire<-as.numeric(together[,1]>as.POSIXct("2002-01-01 00:00:00"))
 	all.output[[i]]<-together
 }


 ##########################################################
 ##########################################################
 ################### MODEL FITTING ########################
 ##########################################################
 ##########################################################

 #####First trend is just flow

 ini.<-matrix(NA,ncol=2,nrow=7)
 ini.[,1]=c(0.1,0.1,0.1,0.1,0.1,0.1,0.1)
 ini.[,2]=c(2,3,4,5,6,7,8)
 likfitmods<-list()
 likfitmods[["flowspatial"]]<-vector()
 likfitmods[["firespatial"]]<-vector()
 likfitmods[["flowfirespatial"]]<-vector()
 Geo<-list()
 library(geoR)

 for(i in 1:simulations){#testing
 	
 	#Get lambda values for the boxcox transformtion
 	#lambda<-boxcox.fit(all.output[[i]]$tp)$lambda
 	#flowlambda<-boxcox.fit(all.output[[i]]$discharge)$lambda
 	##Using a log transform change lambda to 0 for log transform.
 	lambda=0
 	flowlambda=0
 	#Create a data frame for creating geoR object
 	tempData<-data.frame(x=all.output[[i]]$numericdist,y=1,tp=all.output[[i]]$tp,flw=BCtransform(all.output[[i]]$discharge,flowlambda)$data,bfire=as.factor(all.output[[i]]$bush.fire))
 	#Get rid of any duplicates using the jitter function. Note that the latest geoR package is required.
 	tempData[,1:2]<-jitterDupCoords(tempData[,1:2],max=0.01)
 	#Create a geodata object of the data.
 	Geo[[i]]<-as.geodata(tempData,coords.col=1:2,data.col=3,covar.col=4:5)
 ###Plot a variogram of the data, using lambda to transform the data
 #plot(variog(tempGeo,lambda=lambda,max.dist=30,trend=~fre))
 ###Use likfit to optimise the variogram model.
 	
 	
 ##############################
 ##############################
 ####### The models ###########	
 ##############################
 ##############################
 ##The flow model
 	likfitmods[["flow"]][[i]]<-likfit(Geo[[i]],ini=ini.,fix.nugget = F,fix.lambda=T, lik.method = "REML",lambda=lambda,trend=trend.spatial(~flw,Geo[[i]]),cov.model="matern")
 #Check to see if the model has a temporal trend.	
 	if(summary(likfitmods[["flow"]][[i]])$likelihood$AIC<summary(likfitmods[["flow"]][[i]])$ns.likelihood$AIC)likfitmods[["flowspatial"]][i]=1 else likfitmods[["flowspatial"]][i]<-0
 ##The fire model
 	likfitmods[["fire"]][[i]]<-likfit(Geo[[i]],ini=ini.,fix.nugget = F,fix.lambda=T, lik.method = "REML",lambda=lambda,trend=trend.spatial(~flw+bfire,Geo[[i]]),cov.model="matern")
 #Check to see if the model has a temporal trend.	
 	if(summary(likfitmods[["fire"]][[i]])$likelihood$AIC<summary(likfitmods[["fire"]][[i]])$ns.likelihood$AIC)likfitmods[["firespatial"]][i]=1 else likfitmods[["firespatial"]][i]<-0
 ##The flow and fire model
 	likfitmods[["flowfire"]][[i]]<-likfit(Geo[[i]],ini=ini.,fix.nugget = F,fix.lambda=T, lik.method = "REML",lambda=lambda,trend=trend.spatial(~flw*bfire,Geo[[i]]),cov.model="matern")
 #Check to see if the model has a temporal trend.	
 	if(summary(likfitmods[["flowfire"]][[i]])$likelihood$AIC<summary(likfitmods[["flowfire"]][[i]])$ns.likelihood$AIC)likfitmods[["flowfirespatial"]][i]=1 else likfitmods[["flowfirespatial"]][i]<-0
 	cat("We are now up to", i,"\n")
 #
 	print(i)
 }

 ###Save the output
 save(likfitmods,file="likfitmods_Burke_TP.Rdata")
 save(all.output,file="simulated_data_Burke_TP.Rdata")
 save(Geo,file="JitteredGeoObjects_Burke_TP.Rdata")

 ###Get the P values and determine if the interaction is significant.
	#####################################################################################################
	### Aim: To test the significance of randomly allocating flow to 'midnight' water quality samples.
	### Date Created: Thursday 2nd September 2010
	### Author: Jason Lessels
	### Packages required: TSAgg_0.2-1,geoR
	### Notes: The script can easily be modified for the other water quality parameters, and the amount of simulations. Things to check:
	###Both WQ and discharge must have the same initial time stamp (hours) before aggregation.
	###WQ variable modify lines 52,60,146,186
	###line 99 changes the amount of simulations to run.
	###line 156 determines when the bushfire occurred.
	###lines 200:220 need to specify the names of the likfit models.
	###Aggregation, 77,40, and the switch statements in 110:130 - is explained.
	#####################################################################################################



	###Load the TSAgg package.
	install.packages("geoR")
	install.packages("TSAgg")
	library(TSAgg)
	###Set the working dir.
	setwd("E:/4th YEAR PROJECT/data/Burke")
	setwd("~/Documents/code/francesca/")
	###load the quality data
	quality<-read.csv("Burke_worked.csv",header=T)
	###What is the format of the dates?
	head(quality)
	tail(quality)


	###Load the flow data - takes awhile
	flow<-read.table("2122052.TXT",header=T,sep=",")[,-3]
	###Fix the names up
	names(flow)<-c("date","discharge")
	#get rid of missing values
	flow<-flow[!is.na(flow[,2]),]
	##No bug But everything is easier to work with when midnight is the first observation.
	flow<-flow[-c(1:8),]
	##Check out that all is well
	head(flow)
	###Convert the flow data into a timeSeries formatted data frame
	flow<-timeSeries(flow$date,"%d/%m/%Y %H:%M",data=flow$discharge)
	###The aggregation can take some time for long datasets like this one.
	flow.3hr<-hoursAgg(flow,sum,3)
	####Make sure all is well
	head(flow.3hr)
	tail(flow.3hr)
	##Change the names of the aggregated data frame.
	names(flow.3hr)<-c("date","discharge")

	###format the time stamps
	q2<-timeSeries(quality$date,"%d/%m/%Y %H:%M")
	###Add the quality values to the new timeSeries object, as the timeSeries function dosen't allow for factors (Sample.Type)
	q2<-data.frame(q2,quality[,3:7])
	###Get rid of any missing TP values, as this is what we are interested in.
	q2<-q2[!is.na(q2$Phosphorus.Total..mg.L.),]
	###Subset all samples that are collected between midnight and 12.59.59
	temp<-q2[q2[,"hour"]==0,]
	###Remove all automatically collected samples, as it is highly possible for an automatic sample to have been collected at this time.
	midnight.wq<-temp[temp[,"Sample.Type"]!="A",]
	###Create a dataset that is the opposite of the above so the two can be joined after the random samples have been produced. This means that all samples not collected at midnight, or they have been collected using an Automatic sampler.
	other.wq<-q2[q2[,"hour"]!=0\|q2[,"Sample.Type"]=="A",]
	###Grab the date and the TP column
	other.wq<-other.wq[,c(1,12)]
	head(other.wq)
	tail(other.wq)

	#########################################################
	###Test to see if the above separation worked properly
	###Check that all is well, by checking the lengths of the two subsetted data frames and comparing to that of the quality tp object (q2)
	length(midnight.wq[,1])
	length(other.wq[,1])
	##Is the length of all quality samples the same as the length of the problem and non problem samples.
	length(q2[,1])==(length(other.wq[,1])+length(midnight.wq[,1]))
	#########################################################


	###Merge the flow with the water quality that dosent have midnight observations.
	###Sort the dates so they are in chronological order.
	other.wq<-other.wq[order(other.wq[,1]),]
	###Format the dates so we can change the hour for merging with flow
	qual<-timeSeries(other.wq$dates,"%Y-%m-%d %H:%M:%S")
	###Get the aggregated hourly block that the quality observation falls into in respect to the hourly aggregation of the flow.
	temphour<-floor(qual$hour/3)*3
	###Create a temp time stamp for merging with the flow - this will be changed back after the merging takes place
	temptime<-strptime(paste(qual$day,qual$month,qual$year,temphour,0),"%d %m %Y %H %M",tz="GMT")
	#Create a temp qual object for th merging with the new aggregated dates.
	tempqual<-data.frame(date=temptime,tp=other.wq[,2])
	###Merge the quality with flow
	temp<-merge(flow.3hr,tempqual,by="date",all.y=T)
	#Now change the dates back to the original time stamp of the quality observation
	temp$date<-qual$date
	###Now get rid of the missing flow values.
	other.wq<-temp[!is.na(temp[,2]),]




	#############################################################################################################
	##########Below randomly allocates a discharge value from the aggregated flow data, between 8am and 5pm.#####
	#############################################################################################################

	###This sets how many realisations you want this can be changed to anything you want...
	simulations<-100
	###This creates a matrix for the randomly sampled flow
	new.data<-matrix(ncol=simulations,nrow=length(midnight.wq[,1]))
	###Create an object for the dates that the sampled flow represents.
	dates<-matrix(ncol=simulations,nrow=length(midnight.wq[,1]))
	##Cycle through each of the midnight samples
	for(i in 1:length(midnight.wq[,1])){
	#Check to see if flow is recorded on this day and only continue if it is.
	if(length(flow.3hr[(flow.3hr[,1]==as.POSIXct(as.Date(midnight.wq[i,1]))),1])==1){
	#Get the row number in the larger flow dataset, for midnight.
	day.id<-as.numeric(rownames(flow.3hr[(flow.3hr[,1]==as.POSIXct(as.Date(midnight.wq[i,1]))),]))
	#Random sample which hourly aggregated block to use, do this for the length of simulations.
	#\|1\|2,2,2\|3,3,3\|4,4\| = \|8am\|9am,10am,11am\|12pm,1pm,2pm\|3pm,4pm\| - therefore each hour has equal probabilty, but the blocks arent of equal probablity.
	sampled<-sample(c(1,2,2,2,3,3,3,4,4),simulations,replace=T)
	#Now cycle through the cols of the current midnight sample (i) and sample based on the sampled value for this col.
	for(j in 1:simulations){
	###Use a switch statement. There nice and quick and the one slected is based on the sampled value of this col.
	#First switch statment gets the randomly allocated flow value.
	switch(sampled[j],
	new.data[i,j]<-flow.3hr[(day.id+2),2],
	new.data[i,j]<-flow.3hr[(day.id+3),2],
	new.data[i,j]<-flow.3hr[(day.id+4),2],
	new.data[i,j]<-flow.3hr[(day.id+5),2]
	)
	#Second switch statement gets the time of the randomly allocated flow.
	switch(sampled[j],
	dates[i,j]<-paste(flow.3hr[(day.id+2),1]),
	dates[i,j]<-paste(flow.3hr[(day.id+3),1]),
	dates[i,j]<-paste(flow.3hr[(day.id+4),1]),
	dates[i,j]<-paste(flow.3hr[(day.id+5),1])
	)
	}
	}
	}



	############Now merge the newly randomly created value with the good values.

	################################
	### Put it all back together ###
	################################


	##Create a list for to store the simulated data for each realisation
	all.output<-list()
	for(i in 1:simulations){
	#Create the dataset for this realisation
	sim.<-data.frame(date=dates[,i],discharge=new.data[,i],tp=midnight.wq$Phosphorus.Total..mg.L.)
	#Get rid of the missing values
	sim.<-sim.[!is.na(sim.[,2]),]

	#join this realisation with the unchanged observations
	together<-rbind(other.wq,sim.)
	#Now get the dates in order.
	together<-together[order(together[,1]),]
	together$numericdist<-as.numeric(difftime(together[,1],together[1,1],units="days"))
	together$bush.fire<-as.numeric(together[,1]>as.POSIXct("2002-01-01 00:00:00"))
	all.output[[i]]<-together
	}


	##########################################################
	##########################################################
	################### MODEL FITTING ########################
	##########################################################
	##########################################################

	#####First trend is just flow

	ini.<-matrix(NA,ncol=2,nrow=7)
	ini.[,1]=c(0.1,0.1,0.1,0.1,0.1,0.1,0.1)
	ini.[,2]=c(2,3,4,5,6,7,8)
	likfitmods<-list()
	likfitmods[["flowspatial"]]<-vector()
	likfitmods[["firespatial"]]<-vector()
	likfitmods[["flowfirespatial"]]<-vector()
	Geo<-list()
	library(geoR)

	for(i in 1:simulations){#testing

	#Get lambda values for the boxcox transformtion
	#lambda<-boxcox.fit(all.output[[i]]$tp)$lambda
	#flowlambda<-boxcox.fit(all.output[[i]]$discharge)$lambda
	##Using a log transform change lambda to 0 for log transform.
	lambda=0
	flowlambda=0
	#Create a data frame for creating geoR object
	tempData<-data.frame(x=all.output[[i]]$numericdist,y=1,tp=all.output[[i]]$tp,flw=BCtransform(all.output[[i]]$discharge,flowlambda)$data,bfire=as.factor(all.output[[i]]$bush.fire))
	#Get rid of any duplicates using the jitter function. Note that the latest geoR package is required.
	tempData[,1:2]<-jitterDupCoords(tempData[,1:2],max=0.01)
	#Create a geodata object of the data.
	Geo[[i]]<-as.geodata(tempData,coords.col=1:2,data.col=3,covar.col=4:5)
	###Plot a variogram of the data, using lambda to transform the data
	#plot(variog(tempGeo,lambda=lambda,max.dist=30,trend=~fre))
	###Use likfit to optimise the variogram model.


	##############################
	##############################
	####### The models ###########
	##############################
	##############################
	##The flow model
	likfitmods[["flow"]][[i]]<-likfit(Geo[[i]],ini=ini.,fix.nugget = F,fix.lambda=T, lik.method = "REML",lambda=lambda,trend=trend.spatial(~flw,Geo[[i]]),cov.model="matern")
	#Check to see if the model has a temporal trend.
	if(summary(likfitmods[["flow"]][[i]])$likelihood$AIC<summary(likfitmods[["flow"]][[i]])$ns.likelihood$AIC)likfitmods[["flowspatial"]][i]=1 else likfitmods[["flowspatial"]][i]<-0
	##The fire model
	likfitmods[["fire"]][[i]]<-likfit(Geo[[i]],ini=ini.,fix.nugget = F,fix.lambda=T, lik.method = "REML",lambda=lambda,trend=trend.spatial(~flw+bfire,Geo[[i]]),cov.model="matern")
	#Check to see if the model has a temporal trend.
	if(summary(likfitmods[["fire"]][[i]])$likelihood$AIC<summary(likfitmods[["fire"]][[i]])$ns.likelihood$AIC)likfitmods[["firespatial"]][i]=1 else likfitmods[["firespatial"]][i]<-0
	##The flow and fire model
	likfitmods[["flowfire"]][[i]]<-likfit(Geo[[i]],ini=ini.,fix.nugget = F,fix.lambda=T, lik.method = "REML",lambda=lambda,trend=trend.spatial(~flw*bfire,Geo[[i]]),cov.model="matern")
	#Check to see if the model has a temporal trend.
	if(summary(likfitmods[["flowfire"]][[i]])$likelihood$AIC<summary(likfitmods[["flowfire"]][[i]])$ns.likelihood$AIC)likfitmods[["flowfirespatial"]][i]=1 else likfitmods[["flowfirespatial"]][i]<-0
	cat("We are now up to", i,"\n")
	#
	print(i)
	}

	###Save the output
	save(likfitmods,file="likfitmods_Burke_TP.Rdata")
	save(all.output,file="simulated_data_Burke_TP.Rdata")
	save(Geo,file="JitteredGeoObjects_Burke_TP.Rdata")

	###Get the P values and determine if the interaction is significant.