CerebralMastication · August 30, 2010 19:25
diff --git a/StochasticExample.R b/StochasticExample.R
 # example of how to use a Guassian Copula to create random draws from
 # normally distributed data

 require("reshape")
 require("plyr")
 require("QRMlib")
 require("Matrix")

 #make this reproducable
 set.seed(2)

 #how many draws in our starting data set?
 n <- 1e4

 # how many draws do we want from this distribution?
 drawCount <- 1e4

 # ourData will be my starting data which we'll 
 # base our model on
 myData   <- rnorm(n, 5, .6)
 yourData <- myData  + rnorm(n, 8, .25)
 hisData  <- myData  + rnorm(n, 6, .4)
 herData  <- hisData + rnorm(n, 8, .35)

 ourData <- data.frame(myData, yourData, hisData, herData)

 # now we have raw correlations in the 70s, 80s, and 90s. Funny how that
 # works out
 cor(ourData)

 #set up some simple functions for normalizing (Z Score) and 
 #denormalizing 
 normalMoments <- function(t) {
 		as.list(c(mean=mean(t), sd=sd(t)))
 }

 normalize <- function(x) {
   (x - mean(x)) / sd(x)
 }

 deNormalize <- function(x, sampleMean, sampleSd) {
    (x   * sampleSd  + sampleMean)
 }


 # create an object with the mean and sd of each
 # of the margins... you can do this without plyr
 # using apply() if you're into that sort of thing
 # but alply makes it so easy to work with the results
 # we will use these later to denormalize
 normalMomentList <- alply(ourData, 2, normalMoments)

 # normalize i.e. create Z score
 # I think you don't HAVE to do this with gaussian marginals,
 # but you DO with non-gaussian... so I'm in the habit. 
 ourDataZ <- data.frame(apply(ourData, 2, normalize))

 # now ourDataZ is a data frame of Z scores.
 # prove this to yourself by checking the
 # mean and SD of each margin

 apply(ourDataZ, 2, mean)
 apply(ourDataZ, 2, sd)

 #looks like mean of 0 and sd of 1 to me

 # this fixes positive def, fits a copula, and makes draws
 # in one line of code. Try that in Excel, bitches. 
 myDraws <- rcopula.gauss(n=drawCount,
                         Sigma=as.matrix(nearPD(Spearman(ourDataZ),corr=TRUE)$mat),
                         d=ncol(ourData))

 # rcopula.gauss spits out points from 0-1 (i.e. q values) so we need to turn those into
 # Z scores by doing a little norm inversion.
 myDraws <- qnorm(myDraws)

 #check the mean and sd
 apply(myDraws, 2, mean)
 apply(myDraws, 2, sd)

 #should be 0, 1... and they are close..

 # but we can do better than that... we know statistics!
 # let's do a Kolmogorov-Smirnov test to see if these
 # samples come from the same distribution
 # KS does not check correlation, 
 # it only tests if two sets of samples
 # came from same dist.. we'll check each column

 for (i in 1:ncol(ourData)){
   print(ks.test(myDraws[[i]], ourDataZ[[i]]))
 }

 # if the p-value of the KS test is < .05 then we
 # reject that the distributions are equal 
 # they all look > .05 to me.
 # Kolmogorov-Smirnov makes me want to drink, for some odd reason
 # If your starting sample is small, you'll notice that a couple of the variables
 # fail or just barely pass the KS test. This is common because KS is non-parametric and
 # the starting sample will be 'lumpy' and not that big. If starting n gets up
 # to, say, 10K, then they all do better. That's why I started with 10K and still
 # it can be hit or miss

 # so we have a metric shit ton of random draws. But they are Z scores
 # and we want them put back in their original shapes. So let's do that:

 myDrawsDenorm <- myDraws
 for (i in 1:ncol(myDrawsDenorm)) {
   myDrawsDenorm[,i] <- deNormalize(myDraws[,i],
                                     normalMomentList[[i]][[1]],
                                     normalMomentList[[i]][[2]])
 }

 myDrawsDenorm <- data.frame(myDrawsDenorm)
 names(myDrawsDenorm) <- names(ourData)

 #let's look at the mean and standard dev of the starting data
 apply(ourData, 2, mean)
 apply(ourData, 2, sd)

 #compare that with our sample data
 apply(myDrawsDenorm, 2, mean)
 apply(myDrawsDenorm, 2, sd)


 # so myDrawsDenorm contains the final draws
 # let's check Kolmogorov-Smirnov between the starting data
 # and the final draws

 for (i in 1:ncol(ourData)){
   print(ks.test(myDrawsDenorm[[i]], ourData[[i]]))
 }

 # holy shit. it works!

 #look at the correlation matrices
 cor(myDrawsDenorm)
 cor(ourData)

 #it's fun to plot the variables and see if the PDFs line up
 #you could do this for each variable. It's a good sanity check. 
 plot(density(myDrawsDenorm$myData))
 lines(density(ourData$myData), col="red")

 # there's a test to see if the corr matricices are the same
 # but I'm too lazy to google for it
	# example of how to use a Guassian Copula to create random draws from
	# normally distributed data

	require("reshape")
	require("plyr")
	require("QRMlib")
	require("Matrix")

	#make this reproducable
	set.seed(2)

	#how many draws in our starting data set?
	n <- 1e4

	# how many draws do we want from this distribution?
	drawCount <- 1e4

	# ourData will be my starting data which we'll
	# base our model on
	myData <- rnorm(n, 5, .6)
	yourData <- myData + rnorm(n, 8, .25)
	hisData <- myData + rnorm(n, 6, .4)
	herData <- hisData + rnorm(n, 8, .35)

	ourData <- data.frame(myData, yourData, hisData, herData)

	# now we have raw correlations in the 70s, 80s, and 90s. Funny how that
	# works out
	cor(ourData)

	#set up some simple functions for normalizing (Z Score) and
	#denormalizing
	normalMoments <- function(t) {
	as.list(c(mean=mean(t), sd=sd(t)))
	}

	normalize <- function(x) {
	(x - mean(x)) / sd(x)
	}

	deNormalize <- function(x, sampleMean, sampleSd) {
	(x * sampleSd + sampleMean)
	}


	# create an object with the mean and sd of each
	# of the margins... you can do this without plyr
	# using apply() if you're into that sort of thing
	# but alply makes it so easy to work with the results
	# we will use these later to denormalize
	normalMomentList <- alply(ourData, 2, normalMoments)

	# normalize i.e. create Z score
	# I think you don't HAVE to do this with gaussian marginals,
	# but you DO with non-gaussian... so I'm in the habit.
	ourDataZ <- data.frame(apply(ourData, 2, normalize))

	# now ourDataZ is a data frame of Z scores.
	# prove this to yourself by checking the
	# mean and SD of each margin

	apply(ourDataZ, 2, mean)
	apply(ourDataZ, 2, sd)

	#looks like mean of 0 and sd of 1 to me

	# this fixes positive def, fits a copula, and makes draws
	# in one line of code. Try that in Excel, bitches.
	myDraws <- rcopula.gauss(n=drawCount,
	Sigma=as.matrix(nearPD(Spearman(ourDataZ),corr=TRUE)$mat),
	d=ncol(ourData))

	# rcopula.gauss spits out points from 0-1 (i.e. q values) so we need to turn those into
	# Z scores by doing a little norm inversion.
	myDraws <- qnorm(myDraws)

	#check the mean and sd
	apply(myDraws, 2, mean)
	apply(myDraws, 2, sd)

	#should be 0, 1... and they are close..

	# but we can do better than that... we know statistics!
	# let's do a Kolmogorov-Smirnov test to see if these
	# samples come from the same distribution
	# KS does not check correlation,
	# it only tests if two sets of samples
	# came from same dist.. we'll check each column

	for (i in 1:ncol(ourData)){
	print(ks.test(myDraws[[i]], ourDataZ[[i]]))
	}

	# if the p-value of the KS test is < .05 then we
	# reject that the distributions are equal
	# they all look > .05 to me.
	# Kolmogorov-Smirnov makes me want to drink, for some odd reason
	# If your starting sample is small, you'll notice that a couple of the variables
	# fail or just barely pass the KS test. This is common because KS is non-parametric and
	# the starting sample will be 'lumpy' and not that big. If starting n gets up
	# to, say, 10K, then they all do better. That's why I started with 10K and still
	# it can be hit or miss

	# so we have a metric shit ton of random draws. But they are Z scores
	# and we want them put back in their original shapes. So let's do that:

	myDrawsDenorm <- myDraws
	for (i in 1:ncol(myDrawsDenorm)) {
	myDrawsDenorm[,i] <- deNormalize(myDraws[,i],
	normalMomentList[[i]][[1]],
	normalMomentList[[i]][[2]])
	}

	myDrawsDenorm <- data.frame(myDrawsDenorm)
	names(myDrawsDenorm) <- names(ourData)

	#let's look at the mean and standard dev of the starting data
	apply(ourData, 2, mean)
	apply(ourData, 2, sd)

	#compare that with our sample data
	apply(myDrawsDenorm, 2, mean)
	apply(myDrawsDenorm, 2, sd)


	# so myDrawsDenorm contains the final draws
	# let's check Kolmogorov-Smirnov between the starting data
	# and the final draws

	for (i in 1:ncol(ourData)){
	print(ks.test(myDrawsDenorm[[i]], ourData[[i]]))
	}

	# holy shit. it works!

	#look at the correlation matrices
	cor(myDrawsDenorm)
	cor(ourData)

	#it's fun to plot the variables and see if the PDFs line up
	#you could do this for each variable. It's a good sanity check.
	plot(density(myDrawsDenorm$myData))
	lines(density(ourData$myData), col="red")

	# there's a test to see if the corr matricices are the same
	# but I'm too lazy to google for it