Skip to content

Instantly share code, notes, and snippets.

@diamonaj
diamonaj / CS112 11.1
Last active November 14, 2016 20:44
CS112 11.1
### RESULTS FOR CS112 LESSON PLAN 11.1 Version 2
# Start from a clean workspace and fix the RNG seed so results reproduce
rm(list = ls())
set.seed(13345)
### TO CREATE THE DATA (LATER SEPARATED INTO TEST AND TRAINING SETS)
# 100 predictor values drawn uniformly on [-4, 3], rounded to 2 decimals
x <- round(runif(100, -4, 3), 2)
# cubic signal for the outcome...
y <- x^3 + 2 * x^2 - 5 * x - 3
# ...plus random noise (i.e., the epsilon in the regression equation)
y <- jitter(y, 3000)
@diamonaj
diamonaj / CS112 11.1 Revised
Created November 15, 2016 13:33
CS112 11.1 Revised
### RESULTS FOR CS112 LESSON PLAN 11.1
# Clear the workspace and seed the RNG for reproducibility
rm(list = ls())
set.seed(355)
# sample size, kept in one place so it is easy to change
num_data <- 200
### TO CREATE THE DATA (LATER SEPARATED INTO TEST AND TRAINING SETS)
# num_data predictor values drawn uniformly on [-4, 3], rounded to 2 decimals
x <- round(runif(num_data, -4, 3), 2)
# deterministic cubic outcome (no noise added in this version)
y <- x^3 + 2 * x^2 - 5 * x - 3
dev.off() # NOTE(review): closes the current graphics device; errors if none is open -- the device was presumably opened earlier in the full script
### REGRESSION TO THE MEAN
# Fix the seed so the simulated "ability" draws below are reproducible
set.seed(12345)
dev.off() # NOTE(review): second device close -- likely a fragment boundary from the original script; confirm a device is open before running
### Generate "TRUE ABILITY"
# N people, all with "zero" ability
N <- 20
# Mean squared error observed at polynomial degrees 1 through 10
left.panel <- c(23, 18, 14, 10, 11, 10, 9, 10, 10, 9)
# Lay out two side-by-side panels and draw the same error curve in each
par(mfrow = c(1, 2))
for (panel in 1:2) {
  plot(x = 1:10, left.panel, type = "l", lty = 1, lwd = 3,
       ylim = c(5, 25),
       ylab = "mean squared error", xlab = "degree of polynomial")
}
# Overlay six jittered replicates of the curve on the current (second) panel
for (i in 1:6) {
  lines(x = 1:10, y = jitter(left.panel, 10), col = i)
}
# Download TITANIC data, loading stringsAsFactors = FALSE
# (keeps character columns as plain strings instead of factors)
mm <- read.csv("trainTitanic.csv", stringsAsFactors = FALSE)
# delete columns (Name, Cabin, Ticket, PassengerId, SibSp, Parch, Embarked)
# NOTE(review): the positional indices assume the standard Kaggle Titanic
# training file column order (PassengerId, Survived, Pclass, Name, Sex, Age,
# SibSp, Parch, Ticket, Fare, Cabin, Embarked) -- verify against the csv
mm <- mm[,-c(1, 4, 7, 8, 9, 11, 12)]
# dimensions are 891 x 5
# drop every row containing at least one NA
mm <- na.omit(mm)
# dimensions are 714 x 5
# Load the lalonde data set and the random forest library.
data(lalonde)
library(randomForest)
# create lalonde2 (just control units); delete orig data to avoid mistakes
# NOTE(review): in the scraped original, the comments and the code below were
# fused onto single lines, which left the assignments inside comments and
# therefore never executed; they are restored to separate lines here.
lalonde2 <- lalonde[which(lalonde$treat == 0), ]
rm(lalonde)
# eliminate the treatment indicator variable (they are all control units)
# remove additional columns--we are going to predict "u75" (unemployed in '75)
elimin.cols <- which(names(lalonde2) == "treat" |
                     names(lalonde2) == "re75" |
                     names(lalonde2) == "re78")
lalonde2 <- lalonde2[, -elimin.cols]
# make the dependent variable (what we are trying to predict)
# a factor, because random forest will perform classification
# for factors, not for 'numeric' variables.
# notice that I set the levels when I define this factor...
# otherwise, by default, R would set the first factor value
# of the first value appearing in the dataset as "0"
# even though it is actually a "1"--this could be very confusing.
lalonde2$u75 <- factor(lalonde2$u75, levels = c(1, 0))
# Load the lalonde data set and the random forest library
data(lalonde)
library(randomForest)
# create lalonde2 (just control units); delete orig data to avoid mistakes
lalonde2 <- lalonde[which(lalonde$treat == 0), ]
rm(lalonde)
# eliminate the treatment indicator variable (they are all control units)
# remove additional columns--we are going to predict "u75" (unemployed in '75)
elimin.cols <- which(names(lalonde2) %in% c("treat", "re75", "re78"))
# NOTE TO INSTRUCTOR: READ ENTIRE FILE BEFORE CLASS
# RUN CODE FROM LINE 49 TO END BEFORE CLASS
# CONSIDER THE FOLLOWING GAME...
# IMAGINE YOU WANT TO ESTIMATE THE UNDERLYING PARAMETER VALUE
# OF A BERNOULLI DISTRIBUTION (i.e., THE PROBABILITY OF A "1"
# INSTEAD OF A "0"...)
# YOU CAN PULL THE "CORR" LEVER AND OBTAIN 50 CORRELATED DATA
# POINTS FROM THE BERNOULLI DISTRIBUTION
# Fix the seed so the simulated lever pulls reproduce in class
set.seed(123)
# create storage vectors
# to store the means for each experiment
# (starting from c() -- i.e., NULL -- and growing by appending inside the
# experiment loop, which is not shown in this fragment)
storage.corr <- c()
storage.uncorr <- c()
# "PULL THE *CORR* LEVER"
# correlated data
rm(list = ls()) # clears working memory
# To get the data
# set the working directory
setwd("~/Downloads")