Last active
May 10, 2021 17:41
-
-
Save mathzero/efc2b43f8c23ae2ac8a77370809e3b6b to your computer and use it in GitHub Desktop.
Generate a data frame with multiple normally distributed correlated variables in X and simulate X-y signals
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(ggplot2) | |
library(dplyr) | |
library(tidyr) | |
library(faux) | |
library(randomForest) | |
# Set parameters ---------------------------------------------------------- | |
set.seed(123) | |
n=100 | |
p=1000 # for final simulation we probably want p>10,000 or even p>100,000. To keep things speedy while testing we can work with 1000 | |
mean=1 | |
sd=1 | |
r_corr=0.5 # determine the level of correlation between variables in the X matrix | |
# Create X and y data ----------------------------------------------------- | |
# Simulate X data using rnorm_multi function | |
X <- rnorm_multi(n = n, | |
mu = mean, | |
sd = sd, | |
r = r_corr, | |
varnames = paste0("X", 1:p), | |
empirical = FALSE) | |
# Visualise the correlation in X (will take a while if P is large) | |
# heatmap(cor(X),Rowv = NA, Colv=NA) | |
# Initiate Y | |
y=rep(0,n) | |
# Add some signal to the y variable with a function of some X variables | |
X1_coef=2 | |
X2_coef=5 | |
X3_coef=1 | |
interact_coef=2 | |
X4_coef=0.3 | |
X4_exponent=2 | |
noise=1 | |
### apply function + noise | |
y= y + X1_coef*X$X1 + X2_coef*X$X2 + X3_coef*X$X3 + interact_coef*X$X2*X$X3 + X4_coef*(X$X4^X4_exponent) +rnorm(n = n,mean = 0,sd = noise) | |
### Visualise y distribution and relationship to X1 | |
hist(y) | |
plot(X$X1,y) | |
### Quick random forest model to see if it picks up signals | |
rf_mod <- randomForest(x = X,y = y,mtry = floor(sqrt(p))) | |
varImp(rf_mod) | |
varImpPlot(rf_mod) | |
# Expect: | |
# X1 16.8971586 | |
# X2 361.2702060 | |
# X3 145.8711742 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment