Created
October 3, 2018 11:13
-
-
Save josealvarez97/2f0f805345fcc1ad79e6fa0316c2b701 to your computer and use it in GitHub Desktop.
JoseAlvarezCabrera-MakeUpWork-Session3.2-CS112
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Logistic Regression and the lalonde data set
# Requires the Matching package, which bundles the lalonde data.
library(Matching)
data(lalonde)

# Quick exploration of the data set.
head(lalonde)
summary(lalonde)
View(lalonde)
??lalonde

# Note: for simplicity, all operations are performed on the training data
# (no train/test split here), so accuracy estimates are optimistic.

# Fit a logit model: P(treat = 1) as a function of age and education.
lalonde.glm.fit <- glm(treat ~ age + educ, data = lalonde, family = binomial)

# Predicted probabilities of the response being true (treat = 1).
# These are the propensity scores.
glm.probs <- predict(lalonde.glm.fit, type = "response")
glm.probs[1:10]  # probabilities of treat = 1 for the first 10 observations

# Convert probabilities into readable class labels.
dim(lalonde)
# Fixed: vector length was hard-coded as 445; derive it from the data.
glm.pred <- rep("NoTreat", nrow(lalonde))
glm.pred[glm.probs > .5] <- "Treat"
glm.pred

# Fraction of correct predictions (computed on the training set, so this
# does not measure out-of-sample performance).
table(glm.pred, lalonde$treat)
(253 + 9) / nrow(lalonde)  # equals 0.588764 for the original fit

# mean(glm.pred == lalonde$treat) would be WRONG: treat is coded 0/1 in
# the data set, not "Treat"/"NoTreat". Compare the labels explicitly:
mean((glm.pred == "Treat") & (lalonde$treat == 1)) +
  mean((glm.pred == "NoTreat") & (lalonde$treat == 0))
# Both calculations equal 0.588764 — the fraction of correct predictions.
# This could be improved by splitting the data into training and test sets.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# R Lab in Textbook (ISLR) ----
# Fixed: install.packages(("ISLR")) had redundant double parentheses, and
# unconditionally reinstalling on every run is wasteful — install only if
# the package is missing.
if (!requireNamespace("ISLR", quietly = TRUE)) {
  install.packages("ISLR")
}
library(ISLR)

# Explore the Smarket (daily S&P stock market) data set.
names(Smarket)
dim(Smarket)
summary(Smarket)
head(Smarket)

# cor(Smarket) gives an error because Direction (column 9) is qualitative,
# so it is commented out here; drop that column before correlating.
# cor(Smarket)
head(Smarket[, -9])
cor(Smarket[, -9])

# attach(Smarket) would allow plot(Volume); qualify explicitly instead.
plot(Smarket$Volume)
# Logistic Regression on the full Smarket data ----
# Predict Direction from the five lag variables and Volume.
# glm() fits generalized linear models; passing family = binomial
# specifies a logistic regression.
glm.fit <- glm(Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume,
               data = Smarket, family = binomial)
summary(glm.fit)

# coef() accesses the coefficients of the fitted model.
coef(glm.fit)
# summary() also exposes particular aspects of the model:
summary(glm.fit)$coef
summary(glm.fit)$coef[, 4]  # column 4 holds the p-values

# predict() with type = "response" outputs probabilities of the form
# P(Y = 1 | X). With no new data supplied, the training data that was
# used to fit the model is used.
glm.probs <- predict(glm.fit, type = "response")
glm.probs[1:10]

# These values are probabilities of the market going Up: contrasts()
# shows R created a dummy variable with a 1 for "Up".
# Fixed: Direction was referenced bare, which fails without attach();
# qualify it with Smarket$.
contrasts(Smarket$Direction)

# Convert predicted probabilities into class labels.
# Fixed: vector length was hard-coded as 1250; derive it from the data.
glm.pred <- rep("Down", nrow(Smarket))
# Label as "Up" every day whose predicted probability exceeds .5.
glm.pred[glm.probs > .5] <- "Up"
glm.pred

# Confusion matrix: diagonal elements are correct predictions.
table(glm.pred, Smarket$Direction)
(507 + 145) / nrow(Smarket)

# mean() also gives the fraction of days correctly predicted.
mean(glm.pred == Smarket$Direction)
# Testing the model on the training set is misleading.
# Hold out part of the data set to use as a test set.
# Fixed: Year and Direction were referenced bare, which fails without
# attach(Smarket); qualify them with Smarket$.
train <- (Smarket$Year < 2005)
View(train)
Smarket.2005 <- Smarket[!train, ]  # held-out data (test set): days in 2005
dim(Smarket.2005)                  # dimensions of the test set
View(Smarket.2005)
Direction.2005 <- Smarket$Direction[!train]  # test-set responses only
View(Smarket$Direction)

# Fit a logistic regression model using only the observations before
# 2005 (the training set).
glm.fit <- glm(Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume,
               data = Smarket, family = binomial, subset = train)

# Predicted probabilities of the market going up for each test-set day.
# TRAINING used dates < 2005; TESTING uses dates in 2005.
glm.probs <- predict(glm.fit, Smarket.2005, type = "response")

# Convert probabilities to labels.
# Fixed: vector length was hard-coded as 252; derive it from the test set.
glm.pred <- rep("Down", nrow(Smarket.2005))
glm.pred[glm.probs > .5] <- "Up"
table(glm.pred, Direction.2005)
mean(glm.pred == Direction.2005)  # test-set accuracy
mean(glm.pred != Direction.2005)  # test-set error rate
# ~0.52 error — worse than random guessing.

# Refit keeping only Lag1 and Lag2, which seemed to have the highest
# predictive power.
glm.fit <- glm(Direction ~ Lag1 + Lag2, data = Smarket,
               family = binomial, subset = train)
glm.probs <- predict(glm.fit, Smarket.2005, type = "response")
glm.pred <- rep("Down", nrow(Smarket.2005))
glm.pred[glm.probs > .5] <- "Up"
table(glm.pred, Direction.2005)
mean(glm.pred == Direction.2005)  # ~56% of daily movements correctly predicted
106 / (106 + 76)  # ~58% accuracy when predicting increases in the market

# Predict for specific predictor values.
predict(glm.fit,
        newdata = data.frame(Lag1 = c(1.2, 1.5), Lag2 = c(1.1, -0.8)),
        type = "response")
predict(glm.fit, newdata = data.frame(Lag1 = 1.2, Lag2 = 1.1),
        type = "response")
predict(glm.fit, newdata = data.frame(Lag1 = 1.5, Lag2 = -0.8),
        type = "response")
# Logistic Regression and the lalonde data set (repeated fragment of the
# script above; ends after computing the propensity scores).
library(Matching)
data(lalonde)
head(lalonde)
summary(lalonde)
View(lalonde)
??lalonde

# Note: for simplicity, all operations are performed on the training data.
# Fit a logit model: P(treat = 1) as a function of age and education.
lalonde.glm.fit <- glm(treat ~ age + educ, data = lalonde, family = binomial)
# Predicted probabilities of the response being true (treat = 1);
# these are the propensity scores.
glm.probs <- predict(lalonde.glm.fit, type = "response")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment.