Created
October 3, 2018 11:13
-
-
Save josealvarez97/2f0f805345fcc1ad79e6fa0316c2b701 to your computer and use it in GitHub Desktop.
JoseAlvarezCabrera-MakeUpWork-Session3.2-CS112
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Logistic Regression and the lalonde data set
# Requires the Matching package, which bundles the lalonde data.
library(Matching)
data(lalonde)

# Quick exploration of the data set.
head(lalonde)
summary(lalonde)
View(lalonde)
??lalonde

# Note: for simplicity, all operations are performed on the training data
# (no train/test split here), so accuracy estimates are optimistic.

# Fit a logit model: P(treat = 1) as a function of age and education.
lalonde.glm.fit <- glm(treat ~ age + educ, data = lalonde, family = binomial)

# Predicted probabilities of the response being true (treat = 1).
# These are the propensity scores.
glm.probs <- predict(lalonde.glm.fit, type = "response")
glm.probs[1:10]  # probabilities of treat = 1 for the first 10 observations

# Convert probabilities into readable class labels.
dim(lalonde)
# Fixed: vector length was hard-coded as 445; derive it from the data.
glm.pred <- rep("NoTreat", nrow(lalonde))
glm.pred[glm.probs > .5] <- "Treat"
glm.pred

# Fraction of correct predictions (computed on the training set, so this
# does not measure out-of-sample performance).
table(glm.pred, lalonde$treat)
(253 + 9) / nrow(lalonde)  # equals 0.588764 for the original fit

# mean(glm.pred == lalonde$treat) would be WRONG: treat is coded 0/1 in
# the data set, not "Treat"/"NoTreat". Compare the labels explicitly:
mean((glm.pred == "Treat") & (lalonde$treat == 1)) +
  mean((glm.pred == "NoTreat") & (lalonde$treat == 0))
# Both calculations equal 0.588764 — the fraction of correct predictions.
# This could be improved by splitting the data into training and test sets.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# R Lab in Textbook (ISLR) ----
# Fixed: install.packages(("ISLR")) had redundant double parentheses, and
# unconditionally reinstalling on every run is wasteful — install only if
# the package is missing.
if (!requireNamespace("ISLR", quietly = TRUE)) {
  install.packages("ISLR")
}
library(ISLR)

# Explore the Smarket (daily S&P stock market) data set.
names(Smarket)
dim(Smarket)
summary(Smarket)
head(Smarket)

# cor(Smarket) gives an error because Direction (column 9) is qualitative,
# so it is commented out here; drop that column before correlating.
# cor(Smarket)
head(Smarket[, -9])
cor(Smarket[, -9])

# attach(Smarket) would allow plot(Volume); qualify explicitly instead.
plot(Smarket$Volume)
# Logistic Regression on the full Smarket data ----
# Predict Direction from the five lag variables and Volume.
# glm() fits generalized linear models; passing family = binomial
# specifies a logistic regression.
glm.fit <- glm(Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume,
               data = Smarket, family = binomial)
summary(glm.fit)

# coef() accesses the coefficients of the fitted model.
coef(glm.fit)
# summary() also exposes particular aspects of the model:
summary(glm.fit)$coef
summary(glm.fit)$coef[, 4]  # column 4 holds the p-values

# predict() with type = "response" outputs probabilities of the form
# P(Y = 1 | X). With no new data supplied, the training data that was
# used to fit the model is used.
glm.probs <- predict(glm.fit, type = "response")
glm.probs[1:10]

# These values are probabilities of the market going Up: contrasts()
# shows R created a dummy variable with a 1 for "Up".
# Fixed: Direction was referenced bare, which fails without attach();
# qualify it with Smarket$.
contrasts(Smarket$Direction)

# Convert predicted probabilities into class labels.
# Fixed: vector length was hard-coded as 1250; derive it from the data.
glm.pred <- rep("Down", nrow(Smarket))
# Label as "Up" every day whose predicted probability exceeds .5.
glm.pred[glm.probs > .5] <- "Up"
glm.pred

# Confusion matrix: diagonal elements are correct predictions.
table(glm.pred, Smarket$Direction)
(507 + 145) / nrow(Smarket)

# mean() also gives the fraction of days correctly predicted.
mean(glm.pred == Smarket$Direction)
# Testing the model on the training set is misleading.
# Hold out part of the data set to use as a test set.
# Fixed: Year and Direction were referenced bare, which fails without
# attach(Smarket); qualify them with Smarket$.
train <- (Smarket$Year < 2005)
View(train)
Smarket.2005 <- Smarket[!train, ]  # held-out data (test set): days in 2005
dim(Smarket.2005)                  # dimensions of the test set
View(Smarket.2005)
Direction.2005 <- Smarket$Direction[!train]  # test-set responses only
View(Smarket$Direction)

# Fit a logistic regression model using only the observations before
# 2005 (the training set).
glm.fit <- glm(Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + Volume,
               data = Smarket, family = binomial, subset = train)

# Predicted probabilities of the market going up for each test-set day.
# TRAINING used dates < 2005; TESTING uses dates in 2005.
glm.probs <- predict(glm.fit, Smarket.2005, type = "response")

# Convert probabilities to labels.
# Fixed: vector length was hard-coded as 252; derive it from the test set.
glm.pred <- rep("Down", nrow(Smarket.2005))
glm.pred[glm.probs > .5] <- "Up"
table(glm.pred, Direction.2005)
mean(glm.pred == Direction.2005)  # test-set accuracy
mean(glm.pred != Direction.2005)  # test-set error rate
# ~0.52 error — worse than random guessing.

# Refit keeping only Lag1 and Lag2, which seemed to have the highest
# predictive power.
glm.fit <- glm(Direction ~ Lag1 + Lag2, data = Smarket,
               family = binomial, subset = train)
glm.probs <- predict(glm.fit, Smarket.2005, type = "response")
glm.pred <- rep("Down", nrow(Smarket.2005))
glm.pred[glm.probs > .5] <- "Up"
table(glm.pred, Direction.2005)
mean(glm.pred == Direction.2005)  # ~56% of daily movements correctly predicted
106 / (106 + 76)  # ~58% accuracy when predicting increases in the market

# Predict for specific predictor values.
predict(glm.fit,
        newdata = data.frame(Lag1 = c(1.2, 1.5), Lag2 = c(1.1, -0.8)),
        type = "response")
predict(glm.fit, newdata = data.frame(Lag1 = 1.2, Lag2 = 1.1),
        type = "response")
predict(glm.fit, newdata = data.frame(Lag1 = 1.5, Lag2 = -0.8),
        type = "response")
# Logistic Regression and the lalonde data set (repeated fragment of the
# script above; ends after computing the propensity scores).
library(Matching)
data(lalonde)
head(lalonde)
summary(lalonde)
View(lalonde)
??lalonde

# Note: for simplicity, all operations are performed on the training data.
# Fit a logit model: P(treat = 1) as a function of age and education.
lalonde.glm.fit <- glm(treat ~ age + educ, data = lalonde, family = binomial)
# Predicted probabilities of the response being true (treat = 1);
# these are the propensity scores.
glm.probs <- predict(lalonde.glm.fit, type = "response")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment.