Last active
July 15, 2016 16:11
-
-
Save lecy/a783b44d0e902e2e97103c7253b3034c to your computer and use it in GitHub Desktop.
Example of basic regression functions in R
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
---
title: "Regression in R"
output:
  html_notebook: default
---
# create some fake data

```{r}
# three correlated predictors and an outcome driven by x1 and x2
x1 <- 1:100
x2 <- -0.1 * x1 + rnorm(100)
x3 <- 0.05 * x2 + rnorm(100)
y  <- 2 * x1 + 10 * rnorm(100) + 10 * x2

dat <- data.frame( y, x1, x2, x3 )
head( dat )
```
# descriptive statistics

```{r}
library( pastecs )

# transpose so variables are rows and statistics are columns
print( t( stat.desc( dat ) ), digits=3 )

# To copy and paste into Excel:
#
#   descriptives <- t( stat.desc(dat) )
#
#   write.table( descriptives, "clipboard", sep="\t", row.names=TRUE )
```
```{r}
# To create nicely formatted tables for markdown documents use the kable() function
library( knitr )

# keep only n, min, max, median, mean, and std.dev rows
keep <- c( 1, 4, 5, 8, 9, 13 )
kable( t( stat.desc( dat )[ keep, ] ), format="markdown", digits=3 )
```
|   | nbr.val|     min|     max| median|   mean| std.dev|
|:--|-------:|-------:|-------:|------:|------:|-------:|
|y  |     100| -11.210| 118.497| 50.591| 52.395|  31.342|
|x1 |     100|   1.000| 100.000| 50.500| 50.500|  29.011|
|x2 |     100| -10.851|   1.071| -5.097| -4.964|   3.135|
|x3 |     100|  -2.548|   1.857| -0.409| -0.380|   1.060|
# pretty pairs plot

Convenient visual descriptives:

```{r}
# scatterplot matrix of every pair of variables
pairs( dat )
```
This one is better: | |
```{r} | |
panel.cor <- function(x, y, digits=2, prefix="", cex.cor)
{
  # Upper-panel function for pairs(): prints |r| with significance stars.
  #   x, y    - the two variables of the current panel
  #   digits  - significant digits shown for the correlation
  #   prefix  - text pasted in front of the correlation
  #   cex.cor - optional text size for the significance stars

  # save the panel's user coordinates and restore them on exit
  # (BUG FIX: the original `on.exit(par(usr))` passed the saved values
  # positionally and never restored them; par(usr = usr) does)
  usr <- par("usr"); on.exit(par(usr = usr))
  par(usr = c(0, 1, 0, 1))

  r <- abs(cor(x, y))
  txt <- format(c(r, 0.123456789), digits=digits)[1]  # pad so digit count is stable
  txt <- paste(prefix, txt, sep="")

  # BUG FIX: the original only defined `cex` when cex.cor was missing,
  # so supplying cex.cor caused "object 'cex' not found" below
  if(missing(cex.cor)) cex.cor <- 0.8/strwidth(txt)

  test <- cor.test(x,y)
  # significance codes borrowed from printCoefmat
  Signif <- symnum(test$p.value, corr = FALSE, na = FALSE,
                cutpoints = c(0, 0.001, 0.01, 0.05, 0.1, 1),
                symbols = c("***", "**", "*", ".", " "))

  text(0.5, 0.5, txt, cex = 1.5 )
  text(.7, .8, Signif, cex=cex.cor, col=2)
}
# correlations above the diagonal, smoothed scatterplots below
pairs( dat, lower.panel = panel.smooth, upper.panel = panel.cor )
```
# create some fake regression data

```{r}
x <- 1:100
y <- 2*x + rnorm(100,0,20)
plot( x, y )

# random state factor for the dummy-variable examples below
# (use TRUE rather than T: T is reassignable and can silently break code)
dum <- sample( c("NJ","NY","MA","PA"), 100, replace=TRUE )
```
# basic regression syntax

```{r}
lm( y ~ x )           # fit and auto-print the model

m.01 <- lm( y ~ x )   # store the model object for later chunks
summary( m.01 )       # full regression table
```
# nice visual diagnostics

```{r}
# four standard diagnostic plots on a 2x2 grid;
# save the old graphics settings and restore them afterwards
# so the mfrow change does not leak into later chunks
old.par <- par( mfrow=c(2,2) )
plot( m.01 )
par( old.par )
```
# useful model fit functions

```{r}
coefficients( m.01 )          # model coefficients
confint( m.01, level=0.95 )   # CIs for model parameters

# not run because of long output
# anova( m.01 )       # anova table
# fitted( m.01 )      # predicted values
# residuals( m.01 )   # residuals
# influence( m.01 )   # regression diagnostics
```
```{r, warning=FALSE}
# coefficient plot with confidence bands
# (chunk option spelled out as FALSE, not F, to match the chunk below)
library( coefplot )
m.02 <- lm( y ~ x1 + I(x1^2) + x2 + x3 )
coefplot(m.02)
```
# pretty output

```{r, warning=FALSE}
# install.packages( "memisc" )
library( memisc )

m.02 <- lm( y ~ x + I(x^2) )   # quadratic term
m.03 <- lm( y ~ x - 1 )        # no intercept term

# side-by-side regression table for all three models
pretty.table <- mtable( "Model 1"=m.01, "Model 2"=m.02, "Model 3"=m.03,
                        summary.stats=c("R-squared","F","p","N") )
pretty.table
```
# specification

```{r}
summary( lm( y ~ x1 + x2 + x3 ) )

# add different functional forms

# square x1
summary( lm( y ~ x1 + x1^2 + x2 + x3 ) )      # not right
summary( lm( y ~ x1 + I(x1^2) + x2 + x3 ) )   # like this

summary( lm( y ~ log(x1) + x2 + x3 ) )        # log of x1 in formula works fine
```
```{r}
# interactions
summary( lm( y ~ x1 + x2 ) )
summary( lm( y ~ x1 + x2 + I(x1*x2) ) )   # explicit interaction term
summary( lm( y ~ x1*x2 ) )                # shortcut: main effects + interaction
```
```{r}
# dummy variables
summary( lm( y ~ x1 + x2 + x3 + dum ) )       # drop one level (reference group)
summary( lm( y ~ x1 + x2 + x3 + dum - 1 ) )   # keep all levels, drop intercept
```
# standardized regression coefficients (beta)

```{r}
# install.packages( "lm.beta" )
library( lm.beta )

m.01.beta <- lm.beta( m.01 )
summary( m.01.beta )

# coef( m.01.beta )
```
```{r}
# note the standard error is not standardized -
# it describes the regular coefficients, not the standardized ones
summary( m.01 )
```
# or just use the formula: | |
```{r} | |
lm.beta <- function( my.mod )
{
  # Standardized (beta) coefficients for a fitted lm: b * sd(x) / sd(y).
  #   my.mod - an lm object with numeric predictors
  # Returns a named numeric vector, one beta per predictor.
  # NOTE: this definition masks lm.beta::lm.beta() loaded earlier.

  b <- summary(my.mod)$coef[-1, 1]

  # BUG FIX: sd() no longer accepts a data frame (removed in R 3.0.0),
  # so the original failed for any model with two or more predictors.
  # Take the sd of each predictor column; drop=FALSE keeps a data frame
  # even when there is only a single predictor.
  sx <- sapply( my.mod$model[ , -1, drop=FALSE ], sd )
  sy <- sd( my.mod$model[ , 1 ] )

  beta <- b * sx/sy
  return(beta)
}
coefficients( m.01 )   # raw (unstandardized) coefficients
lm.beta( m.01 )        # standardized coefficients
```
# robust standard errors

```{r, warning=FALSE}
# BUG FIX: the package name was misspelled "sandwhich" in the original
# comment; copying it verbatim would make install.packages() fail.
# install.packages( "sandwich" )
# install.packages( "lmtest" )
library(sandwich)
library(lmtest)

m.01 <- lm( y ~ x )

# REGULAR STANDARD ERRORS
summary( m.01 )   # not-robust
```
```{r}
# ROBUST STANDARD ERRORS
# reproduce the Stata default
coeftest( m.01, vcov=vcovHC(m.01,"HC1") )   # robust; HC1 (Stata default)
```
```{r}
# ROBUST STANDARD ERRORS
# check that "sandwich" returns HC0
coeftest( m.01, vcov = sandwich )              # robust; sandwich
coeftest( m.01, vcov = vcovHC(m.01, "HC0") )   # robust; HC0
```
```{r}
# ROBUST STANDARD ERRORS
# check that the default robust var-cov matrix is HC3
coeftest( m.01, vcov = vcovHC(m.01) )          # robust; HC3
coeftest( m.01, vcov = vcovHC(m.01, "HC3") )   # robust; HC3 (default)
```
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment