inkhorn · September 13, 2013 00:31
diff --git a/ebike.r b/ebike.r
 library(rpart)
 library(plyr)
 library(rpart.plot)

 # Load the survey export.  stringsAsFactors = TRUE is spelled out because the
 # recoding further down calls levels() on these columns: since R 4.0.0
 # read.csv() leaves text columns as character by default, levels() would then
 # return NULL and the mapvalues() calls would stop on a from/to length
 # mismatch.
 ebike <- read.csv("E-Bike_Survey_Responses.csv", stringsAsFactors = TRUE)

 # Turn blank responses in columns 2 to 10 into NA.  Assigning NA to factor
 # elements does not drop the '' level itself, so levels() still lists it
 # afterwards.
 ebike[, 2:10][ebike[, 2:10] == ""] <- NA

 # Collapse rarely used answers with mapvalues() from plyr.  For each factor
 # the levels picked out by the negative index are lumped into "Other"; the
 # levels at the excluded positions are left as they are.  The replacement
 # vectors are sized from the selected levels instead of hard-coded counts,
 # because mapvalues() stops when `from` and `to` differ in length.
 # NOTE(review): the positions are tied to the level order of this particular
 # export (position 1 is presumably the blank level) - re-check them if the
 # data file changes.

 # Sex: the blank level is folded into "Other" as well (not NA).
 ebike$Sex <- local({
   lumped <- c("", levels(ebike$Sex)[-c(1, 2, 6)])
   mapvalues(ebike$Sex, lumped, rep("Other", length(lumped)))
 })

 # Health: blank becomes NA, the selected levels become "Other".
 ebike$Health <- local({
   health <- ebike$How.would.you.describe.your.level.of.physical.health.
   lumped <- levels(health)[-c(1, 4, 5, 6, 12, 13)]
   mapvalues(health, c("", lumped), c(NA, rep("Other", length(lumped))))
 })

 # Edu (from column 5): same scheme as Health.
 ebike$Edu <- local({
   edu <- ebike[, 5]
   lumped <- levels(edu)[-c(1, 4, 8, 14, 23)]
   mapvalues(edu, c("", lumped), c(NA, rep("Other", length(lumped))))
 })

 # Income (column 6) and Age (column 2): only the blank level is mapped to NA.
 ebike$Income <- mapvalues(ebike[, 6], "", NA)
 ebike$Age <- mapvalues(ebike[, 2], "", NA)

 # Reduce the varied answers in column 11 to a handful of transport modes.
 # A response is labelled by the first keyword it contains, checked in the
 # order bicycle, e-bike, car, transit; anything else is "Other".  The labels
 # are written in reverse priority so that the earlier keyword wins.
 ebike$transport <- local({
   answers <- ebike[, 11]
   label <- rep("Other", length(answers))
   label[grepl("transit", answers)] <- "Transit"
   label[grepl("car", answers)] <- "Car"
   label[grepl("e-bike", answers)] <- "E-bike"
   label[grepl("bicycle", answers)] <- "Bicycle"
   factor(label)
 })

 # Fit two separate partition trees for the response transport == "E-bike":
 # the first on Sex, Health and Age, the second on Edu and Income.  The author
 # keeps them apart because a combined model only picks some of the
 # predictors, whereas two trees describe E-bike users on separate dimensions.
 # NOTE(review): the response is a logical vector, not a factor, so rpart()
 # presumably falls back to method = "anova" - confirm that is intended.
 b <- rpart(
   formula = transport == "E-bike" ~ Sex + Health + Age,
   data = ebike
 )
 c <- rpart(
   formula = transport == "E-bike" ~ Edu + Income,
   data = ebike
 )

 # Draw both trees.  faclen is set high on purpose so that factor level
 # labels are shown in full rather than abbreviated.
 rpart.plot(
   b,
   type = 1, extra = 1,
   varlen = 0, faclen = 10
 )
 rpart.plot(
   c,
   type = 1, extra = 1,
   varlen = 0, faclen = 20
 )
	library(rpart)
	library(plyr)
	library(rpart.plot)

	# Load the survey export.  stringsAsFactors = TRUE is spelled out because the
	# recoding further down calls levels() on these columns: since R 4.0.0
	# read.csv() leaves text columns as character by default, levels() would then
	# return NULL and the mapvalues() calls would stop on a from/to length
	# mismatch.
	ebike <- read.csv("E-Bike_Survey_Responses.csv", stringsAsFactors = TRUE)

	# Turn blank responses in columns 2 to 10 into NA.  Assigning NA to factor
	# elements does not drop the '' level itself, so levels() still lists it
	# afterwards.
	ebike[, 2:10][ebike[, 2:10] == ""] <- NA

	# Collapse rarely used answers with mapvalues() from plyr.  For each factor
	# the levels picked out by the negative index are lumped into "Other"; the
	# levels at the excluded positions are left as they are.  The replacement
	# vectors are sized from the selected levels instead of hard-coded counts,
	# because mapvalues() stops when `from` and `to` differ in length.
	# NOTE(review): the positions are tied to the level order of this particular
	# export (position 1 is presumably the blank level) - re-check them if the
	# data file changes.

	# Sex: the blank level is folded into "Other" as well (not NA).
	ebike$Sex <- local({
	  lumped <- c("", levels(ebike$Sex)[-c(1, 2, 6)])
	  mapvalues(ebike$Sex, lumped, rep("Other", length(lumped)))
	})

	# Health: blank becomes NA, the selected levels become "Other".
	ebike$Health <- local({
	  health <- ebike$How.would.you.describe.your.level.of.physical.health.
	  lumped <- levels(health)[-c(1, 4, 5, 6, 12, 13)]
	  mapvalues(health, c("", lumped), c(NA, rep("Other", length(lumped))))
	})

	# Edu (from column 5): same scheme as Health.
	ebike$Edu <- local({
	  edu <- ebike[, 5]
	  lumped <- levels(edu)[-c(1, 4, 8, 14, 23)]
	  mapvalues(edu, c("", lumped), c(NA, rep("Other", length(lumped))))
	})

	# Income (column 6) and Age (column 2): only the blank level is mapped to NA.
	ebike$Income <- mapvalues(ebike[, 6], "", NA)
	ebike$Age <- mapvalues(ebike[, 2], "", NA)

	# Reduce the varied answers in column 11 to a handful of transport modes.
	# A response is labelled by the first keyword it contains, checked in the
	# order bicycle, e-bike, car, transit; anything else is "Other".  The labels
	# are written in reverse priority so that the earlier keyword wins.
	ebike$transport <- local({
	  answers <- ebike[, 11]
	  label <- rep("Other", length(answers))
	  label[grepl("transit", answers)] <- "Transit"
	  label[grepl("car", answers)] <- "Car"
	  label[grepl("e-bike", answers)] <- "E-bike"
	  label[grepl("bicycle", answers)] <- "Bicycle"
	  factor(label)
	})

	# Fit two separate partition trees for the response transport == "E-bike":
	# the first on Sex, Health and Age, the second on Edu and Income.  The author
	# keeps them apart because a combined model only picks some of the
	# predictors, whereas two trees describe E-bike users on separate dimensions.
	# NOTE(review): the response is a logical vector, not a factor, so rpart()
	# presumably falls back to method = "anova" - confirm that is intended.
	b <- rpart(
	  formula = transport == "E-bike" ~ Sex + Health + Age,
	  data = ebike
	)
	c <- rpart(
	  formula = transport == "E-bike" ~ Edu + Income,
	  data = ebike
	)

	# Draw both trees.  faclen is set high on purpose so that factor level
	# labels are shown in full rather than abbreviated.
	rpart.plot(
	  b,
	  type = 1, extra = 1,
	  varlen = 0, faclen = 10
	)
	rpart.plot(
	  c,
	  type = 1, extra = 1,
	  varlen = 0, faclen = 20
	)