Created
August 30, 2010 19:25
-
-
Save CerebralMastication/557900 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# example of how to use a Guassian Copula to create random draws from | |
# normally distributed data | |
require("reshape") | |
require("plyr") | |
require("QRMlib") | |
require("Matrix") | |
#make this reproducable | |
set.seed(2) | |
#how many draws in our starting data set? | |
n <- 1e4 | |
# how many draws do we want from this distribution? | |
drawCount <- 1e4 | |
# ourData will be my starting data which we'll | |
# base our model on | |
myData <- rnorm(n, 5, .6) | |
yourData <- myData + rnorm(n, 8, .25) | |
hisData <- myData + rnorm(n, 6, .4) | |
herData <- hisData + rnorm(n, 8, .35) | |
ourData <- data.frame(myData, yourData, hisData, herData) | |
# now we have raw correlations in the 70s, 80s, and 90s. Funny how that | |
# works out | |
cor(ourData) | |
#set up some simple functions for normalizing (Z Score) and | |
#denormalizing | |
normalMoments <- function(t) { | |
as.list(c(mean=mean(t), sd=sd(t))) | |
} | |
normalize <- function(x) { | |
(x - mean(x)) / sd(x) | |
} | |
deNormalize <- function(x, sampleMean, sampleSd) { | |
(x * sampleSd + sampleMean) | |
} | |
# create an object with the mean and sd of each | |
# of the margins... you can do this without plyr | |
# using apply() if you're into that sort of thing | |
# but alply makes it so easy to work with the results | |
# we will use these later to denormalize | |
normalMomentList <- alply(ourData, 2, normalMoments) | |
# normalize i.e. create Z score | |
# I think you don't HAVE to do this with gaussian marginals, | |
# but you DO with non-gaussian... so I'm in the habit. | |
ourDataZ <- data.frame(apply(ourData, 2, normalize)) | |
# now ourDataZ is a data frame of Z scores. | |
# prove this to yourself by checking the | |
# mean and SD of each margin | |
apply(ourDataZ, 2, mean) | |
apply(ourDataZ, 2, sd) | |
#looks like mean of 0 and sd of 1 to me | |
# this fixes positive def, fits a copula, and makes draws | |
# in one line of code. Try that in Excel, bitches. | |
myDraws <- rcopula.gauss(n=drawCount, | |
Sigma=as.matrix(nearPD(Spearman(ourDataZ),corr=TRUE)$mat), | |
d=ncol(ourData)) | |
# rcopula.gauss spits out points from 0-1 (i.e. q values) so we need to turn those into | |
# Z scores by doing a little norm inversion. | |
myDraws <- qnorm(myDraws) | |
#check the mean and sd | |
apply(myDraws, 2, mean) | |
apply(myDraws, 2, sd) | |
#should be 0, 1... and they are close.. | |
# but we can do better than that... we know statistics! | |
# let's do a Kolmogorov-Smirnov test to see if these | |
# samples come from the same distribution | |
# KS does not check correlation, | |
# it only tests if two sets of samples | |
# came from same dist.. we'll check each column | |
for (i in 1:ncol(ourData)){ | |
print(ks.test(myDraws[[i]], ourDataZ[[i]])) | |
} | |
# if the p-value of the KS test is < .05 then we | |
# reject that the distributions are equal | |
# they all look > .05 to me. | |
# Kolmogorov-Smirnov makes me want to drink, for some odd reason | |
# If your starting sample is small, you'll notice that a couple of the variables | |
# fail or just barely pass the KS test. This is common because KS is non-parametric and | |
# the starting sample will be 'lumpy' and not that big. If starting n gets up | |
# to, say, 10K, then they all do better. That's why I started with 10K and still | |
# it can be hit or miss | |
# so we have a metric shit ton of random draws. But they are Z scores | |
# and we want them put back in their original shapes. So let's do that: | |
myDrawsDenorm <- myDraws | |
for (i in 1:ncol(myDrawsDenorm)) { | |
myDrawsDenorm[,i] <- deNormalize(myDraws[,i], | |
normalMomentList[[i]][[1]], | |
normalMomentList[[i]][[2]]) | |
} | |
myDrawsDenorm <- data.frame(myDrawsDenorm) | |
names(myDrawsDenorm) <- names(ourData) | |
#let's look at the mean and standard dev of the starting data | |
apply(ourData, 2, mean) | |
apply(ourData, 2, sd) | |
#compare that with our sample data | |
apply(myDrawsDenorm, 2, mean) | |
apply(myDrawsDenorm, 2, sd) | |
# so myDrawsDenorm contains the final draws | |
# let's check Kolmogorov-Smirnov between the starting data | |
# and the final draws | |
for (i in 1:ncol(ourData)){ | |
print(ks.test(myDrawsDenorm[[i]], ourData[[i]])) | |
} | |
# holy shit. it works! | |
#look at the correlation matrices | |
cor(myDrawsDenorm) | |
cor(ourData) | |
#it's fun to plot the variables and see if the PDFs line up | |
#you could do this for each variable. It's a good sanity check. | |
plot(density(myDrawsDenorm$myData)) | |
lines(density(ourData$myData), col="red") | |
# there's a test to see if the corr matricices are the same | |
# but I'm too lazy to google for it |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment