Created
September 13, 2013 00:31
-
-
Save inkhorn/6545575 to your computer and use it in GitHub Desktop.
E-bike Survey Analysis
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(rpart)
library(plyr)
library(rpart.plot)

# Load the raw survey export. The recoding steps later in this script call
# levels() on these columns, so text columns must arrive as factors. Since
# R 4.0.0 read.csv() no longer converts strings to factors by default, so the
# conversion is requested explicitly.
ebike <- read.csv("E-Bike_Survey_Responses.csv", stringsAsFactors = TRUE)

# This next part is strictly to change any blank responses into NAs.
# Only columns 2 through 10 are touched.
ebike[, 2:10][ebike[, 2:10] == ""] <- NA
# In this section we use mapvalues from the plyr package to get rid of blanks,
# but also to reduce the number of values in each factor that we use.

# Relabel the levels of factor `x`:
#   * the blank level "" becomes `blank_to`;
#   * every level whose position in levels(x) is NOT listed in `keep_idx`
#     becomes "Other".
# The number of "Other" replacements is derived from the levels actually being
# lumped, so the call cannot fail on a from/to length mismatch when the set of
# free-text answers in the data changes.
collapse_levels <- function(x, keep_idx, blank_to) {
  lumped <- levels(x)[-keep_idx]
  mapvalues(x, c("", lumped), c(blank_to, rep("Other", length(lumped))))
}

# NOTE(review): the position lists below are tied to the level order of one
# particular export of the survey (position 1 is presumably the blank level);
# confirm them against levels() if the data file changes.
ebike$Sex <- collapse_levels(ebike$Sex, c(1, 2, 6), "Other")
ebike$Health <- collapse_levels(
  ebike$How.would.you.describe.your.level.of.physical.health.,
  c(1, 4, 5, 6, 12, 13),
  NA
)
ebike$Edu <- collapse_levels(ebike[, 5], c(1, 4, 8, 14, 23), NA)

# Income (column 6) and Age (column 2) keep all their levels; only the blank
# level is turned into NA.
ebike$Income <- mapvalues(ebike[, 6], "", NA)
ebike$Age <- mapvalues(ebike[, 2], "", NA)
# People put a lot of varying answers in here, but the categories I've chosen
# here can be found in most of them.
# Column 11 is searched for four substrings (case-sensitive). When an answer
# contains more than one of them, the priority is
# bicycle > e-bike > car > transit; answers matching none become "Other".
ebike$transport <- local({
  answer <- ebike[, 11]
  label <- rep("Other", length(answer))
  # Assign from lowest to highest priority so later matches overwrite earlier
  # ones, reproducing the precedence of a nested ifelse() chain.
  label[grepl("transit", answer)] <- "Transit"
  label[grepl("car", answer)] <- "Car"
  label[grepl("e-bike", answer)] <- "E-bike"
  label[grepl("bicycle", answer)] <- "Bicycle"
  factor(label)
})
# Here we ask R to make two trees: the first based on Sex, Health and Age (they
# seem like they go together), the second based on education and income. You
# can try to put them together, but you will find that only some are chosen as
# the most significant splits. Therefore, keeping them apart describes for us
# E-bike users on separate dimensions.
#
# The response is the logical indicator `transport == "E-bike"`. For a response
# that is not a factor, rpart falls back to method = "anova", so each node's
# fitted value is the proportion of e-bike users in that node. The method is
# spelled out here so the choice is visible rather than guessed.
b <- rpart(
  transport == "E-bike" ~ Sex + Health + Age,
  data = ebike,
  method = "anova"
)
c <- rpart(
  transport == "E-bike" ~ Edu + Income,
  data = ebike,
  method = "anova"
)
# And here we plot the two partition tree models. I like seeing the factor
# label values in their entirety, so I've chosen a large enough number for the
# 'faclen' argument in each call to rpart.plot. varlen = 0 asks for variable
# names to be shown in full as well.
rpart.plot(b, type = 1, extra = 1, varlen = 0, faclen = 10)
rpart.plot(c, type = 1, extra = 1, varlen = 0, faclen = 20)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment