Created
January 29, 2018 12:44
-
-
Save anonymous/fabecccf33f9c3feb568384f626a2c07 to your computer and use it in GitHub Desktop.
Code for correlations article
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Detecting correlation | |
# Defines three functions using base R to illustrate techniques for identifying correlations | |
# between continuous random variables, then tests against different types of data | |
# Pearsons r, distance correlation, Maximal Information Coefficient (approximated) | |
# A simple bootstrap function to estimate confidence intervals | |
bootstrap <- function(x,y,func,reps,alpha){ | |
estimates <- c() | |
original <- data.frame(x,y) | |
N <- dim(original)[1] | |
for(i in 1:reps){ | |
S <- original[sample(1:N, N, replace = TRUE),] | |
estimates <- append(estimates, func(S$x, S$y)) | |
} | |
l <- alpha/2 ; u <- 1 - l | |
interval <- quantile(estimates, c(u, l)) | |
return(2*(func(x,y)) - as.numeric(interval[1:2])) | |
} | |
# Pearson's r | |
Pearsons <- function(x,y){ | |
x <- x - mean(x); y <- y - mean(y) | |
dotProduct <- 0 | |
for(i in 1:length(x)){ | |
dotProduct <- dotProduct + (x[i] * y[i]) | |
} | |
magnitudeX <- sqrt(sum(x**2)) | |
magnitudeY <- sqrt(sum(y**2)) | |
cosTheta <- dotProduct / (magnitudeX * magnitudeY) | |
return(round(cosTheta,4)) | |
} | |
# Distance Correlation | |
distanceCorrelation<- function(x,y){ | |
# Function to double center the data | |
doubleCenter <- function(x){ | |
centered <- x | |
for(i in 1:dim(x)[1]){ | |
for(j in 1:dim(x)[2]){ | |
centered[i,j] = x[i,j] - mean(x[i,]) - mean(x[,j]) + mean(x) | |
} | |
} | |
return(centered) | |
} | |
# Calculate the distance covariance | |
distanceCovariance <- function(x,y){ | |
N <- length(x) | |
distX <- as.matrix(dist(x)) | |
distY <- as.matrix(dist(y)) | |
centeredX <- doubleCenter(distX) | |
centeredY <- doubleCenter(distY) | |
calc = sum(centeredX * centeredY) | |
return(sqrt(calc/(N^2))) | |
} | |
# Use distanceCovariance(x,x) to get distance Variance of x | |
distanceVariance <- function(x){return(distanceCovariance(x,x))} | |
# Find numerator and denominator, return distance correlation | |
cov = distanceCovariance(x,y) | |
sd = sqrt(distanceVariance(x)*distanceVariance(y)) | |
return(round(cov/sd ,4)) | |
} | |
# Maximal Information Coefficient | |
MIC <- function(x,y){ | |
# Get joint distribution of x and y | |
jointDist <- function(x,y){ | |
N <- length(x) | |
u <- unique(append(x,y)) | |
joint <- c() | |
for(i in u){ | |
for(j in u){ | |
f <- x[paste(x,y) == paste(i,j)] | |
joint <- append(joint, length(f)/N) | |
} | |
} | |
return(joint) | |
} | |
# Calculate the product of the marginal distributions of x and y | |
marginalProduct <- function(x,y){ | |
N <- length(x) | |
u <- unique(append(x,y)) | |
marginal <- c() | |
for(i in u){ | |
for(j in u){ | |
fX <- length(x[x == i]) / N | |
fY <- length(y[y == j]) / N | |
marginal <- append(marginal, fX * fY) | |
} | |
} | |
return(marginal) | |
} | |
# Mutual Information of x and y | |
mutualInfo <- function(x,y){ | |
joint <- jointDist(x,y) | |
marginal <- marginalProduct(x,y) | |
Hjm <- - sum(joint[marginal > 0] * log(marginal[marginal > 0],2)) | |
Hj <- - sum(joint[joint > 0] * log(joint[joint > 0],2)) | |
return(Hjm - Hj) | |
} | |
# Start binning x and y to find MIC | |
N <- length(x) | |
maxBins <- ceiling(N ** 0.6) | |
MI <- c() | |
for(i in 2:maxBins) { | |
for (j in 2:maxBins){ | |
if(i * j > maxBins){ | |
next | |
} | |
Xbins <- i; Ybins <- j | |
binnedX <-cut(x, breaks=Xbins, labels = 1:Xbins) | |
binnedY <-cut(y, breaks=Ybins, labels = 1:Ybins) | |
MI_estimate <- mutualInfo(binnedX,binnedY) | |
MI_normalized <- MI_estimate / log(min(Xbins,Ybins),2) | |
MI <- append(MI, MI_normalized) | |
} | |
} | |
return(round(max(MI),4)) | |
} | |
# Test data | |
set.seed(123) | |
# Noise | |
x0 <- rnorm(100,0,1) | |
y0 <- rnorm(100,0,1) | |
plot(y0~x0, pch =18) | |
cor(x0,y0) | |
distanceCorrelation(x0,y0) | |
MIC(x0,y0) | |
# Simple linear relationship | |
x1 <- -20:20 | |
y1 <- x1 + rnorm(41,0,4) | |
plot(y1~x1, pch =18) | |
Pearsons(x1,y1) | |
distanceCorrelation(x1,y1) | |
MIC(x1,y1) | |
# y ~ x**2 | |
x2 <- -20:20 | |
y2 <- x2**2 + rnorm(41,0,40) | |
plot(y2~x2, pch = 18) | |
Pearsons(x2,y2) | |
distanceCorrelation(x2,y2) | |
MIC(x2,y2) | |
# Cosine | |
x3 <- -20:20 | |
y3 <- cos(x3/4) + rnorm(41,0,0.2) | |
plot(y3~x3, type='p', pch=18) | |
abline(nls(y3~cos(x3/4)),col='red',lwd=2) | |
cor(x3,y3) | |
Pearsons(x3,y3) | |
distanceCorrelation(x3,y3) | |
MIC(x3,y3) | |
# Circle | |
n <- 50 | |
theta <- runif (n, 0, 2*pi) | |
x4 <- append(cos (theta) , cos(theta)) | |
y4 <- append( sin (theta), -sin (theta)) | |
plot(x4,y4, pch=18) | |
Pearsons(x4,y4) | |
distanceCorrelation(x4,y4) | |
MIC(x4,y4) | |
cor(x4,y4) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Hi, I have used the given code to compute the distance correlation of my two generated variables in basic R. Although I have adjusted the chunk to my specific variables R will not give me a result but only defines the function - What might be the problem here?