Last active
March 12, 2016 13:35
-
-
Save stephlocke/ffd179813534db9c77a1 to your computer and use it in GitHub Desktop.
Presentation code for building models iteratively
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| --- | |
| title: "Stats 101" | |
| output: | |
| revealjs::revealjs_presentation: | |
| theme: white | |
| --- | |
| ```{r setup, include=FALSE} | |
| knitr::opts_chunk$set(echo = TRUE, message = FALSE, dev="svg", | |
| fig.height = 500) | |
| library(ggplot2) | |
| library(vegalite) | |
| library(DT) | |
| library(readxl) | |
| library(data.table) | |
| vegalite <- vegalite() %>% | |
| cell_size(700, 400) | |
| Mode <- function(x) { | |
| ux <- unique(x) | |
| ux[which.max(tabulate(match(x, ux)))] | |
| } | |
| ``` | |
| # Aim | |
| ## Aim | |
| - Predict the ages of SQL Saturday Exeter attendees | |
| # Our data | |
| ## Our data | |
| We have the results from our survey. | |
| ```{r} | |
| results<-setDT(read_excel("C:/Users/steph/OneDrive/Predicting Age.xlsx")) | |
| results<-results[Age<100,] | |
| ``` | |
| Our sample has `r nrow(results)` records. | |
| ## Results | |
| ```{r} | |
| DT::datatable(results) | |
| ``` | |
| ## Age Distribution | |
| ```{r} | |
| vegalite %>% | |
| add_data(results) %>% | |
| encode_x("Age","quantitative") %>% | |
| encode_y("*", "quantitative", aggregate="count") %>% | |
| bin_x(maxbins=10) %>% mark_bar() | |
| ``` | |
| ## Experience Distribution | |
| ```{r} | |
| vegalite %>% | |
| add_data(results) %>% | |
| encode_x("Experience","quantitative") %>% | |
| encode_y("*", "quantitative", aggregate="count") %>% | |
| bin_x(maxbins=10) %>% mark_bar() | |
| ``` | |
| ## Field Distribution | |
| ```{r} | |
| vegalite %>% | |
| add_data(results) %>% | |
| encode_x("Field","ordinal") %>% | |
| encode_y("*", "quantitative", aggregate="count") %>% | |
| mark_bar() | |
| ``` | |
| ## All data | |
| ```{r} | |
| vegalite %>% | |
| add_data(results) %>% | |
| encode_x("Experience","quantitative") %>% | |
| encode_y("Age", "quantitative") %>% | |
| encode_color("Field", "nominal") %>% | |
| mark_point() | |
| ``` | |
| # Sampling | |
| ```{r} | |
| testPer<-.75 | |
| inTest<-sample(1:nrow(results),nrow(results)*testPer) | |
| training<-results[inTest,] | |
| holdout<-results[!inTest,] | |
| ``` | |
| ## Training | |
| ```{r} | |
| vegalite %>% | |
| add_data(training) %>% | |
| encode_x("Experience","quantitative") %>% | |
| encode_y("Age", "quantitative") %>% | |
| encode_color("Field", "nominal") %>% | |
| mark_point() | |
| ``` | |
| ## Holdout | |
| ```{r} | |
| vegalite %>% | |
| add_data(holdout) %>% | |
| encode_x("Experience","quantitative") %>% | |
| encode_y("Age", "quantitative") %>% | |
| encode_color("Field", "nominal") %>% | |
| mark_point() | |
| ``` | |
| # One size fits all | |
| ## One size fits all | |
| We could take some measure of central tendency to predict the age of attendees. | |
| ```{r} | |
| averages<-training[,.(Mean=floor(mean(Age)) | |
| ,Median=floor(median(Age)) | |
| ,Mode=Mode(Age) | |
| )] | |
| knitr::kable(averages) | |
| ``` | |
| ## Results | |
| ```{r} | |
| holdout[,colnames(averages):=averages] | |
| holdout.m<-melt(holdout, measure.vars = c("Age",colnames(averages))) | |
| vegalite %>% | |
| add_data(holdout.m) %>% | |
| encode_x("Name", "ordinal") %>% | |
| encode_y("value", "quantitative") %>% | |
| encode_color("variable", "nominal")%>% | |
| mark_point() | |
| ``` | |
| ## Assessing results | |
| ```{r} | |
| holdout.lse<-melt(holdout, measure.vars = colnames(averages)) | |
| holdout.lse[,Error:=(Age-value)^2] | |
| knitr::kable(holdout.lse[,.(LSE=sum(Error)), variable][order(LSE)]) | |
| ``` | |
| # Line of best fit | |
| $y=mx+c$ | |
| ```{r} | |
| expLM<-lm(Age~Experience, training) | |
| summary(expLM) | |
| ``` | |
| ## Model | |
| ```{r} | |
| training[,expLMres:=expLM$fitted] | |
| ggplot(training, aes(x=Experience, y=Age))+ | |
| geom_point()+ | |
| geom_line(aes(y=expLMres),colour="blue")+ | |
| theme_minimal() | |
| ``` | |
| ## Results | |
| ```{r} | |
| holdout[,expLMres:=predict(expLM,holdout)] | |
| holdout.m<-melt(holdout, measure.vars = c("Age","expLMres")) | |
| vegalite %>% | |
| add_data(holdout.m) %>% | |
| encode_x("Name", "ordinal") %>% | |
| encode_y("value", "quantitative") %>% | |
| encode_color("variable", "nominal")%>% | |
| mark_point() | |
| ``` | |
| ## Assessing results | |
| ```{r} | |
| holdout.lse<-melt(holdout, measure.vars = c("expLMres",colnames(averages))) | |
| holdout.lse[,Error:=(Age-value)^2] | |
| knitr::kable(holdout.lse[,.(LSE=sum(Error)), variable][order(LSE)]) | |
| ``` | |
| # Complex linear regression | |
| ## Complex linear regression | |
| - $y=m_1 x_1 +...+ m_n x_n +c$ | |
| - Contrasts | |
| ```{r} | |
| fieldLM<-lm(Age~Experience + Field, training) | |
| summary(fieldLM) | |
| ``` | |
| ## Model | |
| ```{r} | |
| training[,fieldLMres:=fieldLM$fitted] | |
| ggplot(training, aes(x=Experience, y=Age, group=Field, colour=Field))+ | |
| geom_point()+ | |
| geom_line(aes(y=fieldLMres, group=Field),colour="blue")+ | |
| facet_wrap(~Field)+ | |
| theme_minimal() | |
| ``` | |
| ## Results | |
| ```{r} | |
| holdout[,fieldLMres:=predict(fieldLM,holdout)] | |
| holdout.m<-melt(holdout, measure.vars = c("Age","fieldLMres")) | |
| vegalite %>% | |
| add_data(holdout.m) %>% | |
| encode_x("Name", "ordinal") %>% | |
| encode_y("value", "quantitative") %>% | |
| encode_color("variable", "nominal")%>% | |
| mark_point() | |
| ``` | |
| ## Assessing results | |
| ```{r} | |
| holdout.lse<-melt(holdout, measure.vars = c("fieldLMres","expLMres",colnames(averages))) | |
| holdout.lse[,Error:=(Age-value)^2] | |
| knitr::kable(holdout.lse[,.(LSE=sum(Error)), variable][order(LSE)]) | |
| ``` | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment