Created
May 2, 2013 01:01
-
-
Save inkhorn/5499509 to your computer and use it in GitHub Desktop.
Casino Analysis
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(ff) | |
library(ffbase) | |
library(stringr) | |
library(ggplot2) | |
library(ggthemes) | |
library(reshape2) | |
library(RgoogleMaps) | |
# Load two copies of the same survey file: `casino` is recoded to numeric
# scores further down, while `casino.orig` keeps the original text values.
#
# stringsAsFactors is set explicitly. read.csv() converted strings to factors
# by default when this script was written, but R >= 4.0.0 no longer does, and
# the later steps (factor reordering, table()-based summaries, conversion to
# ffdf) were written against factor columns.
casino.csv <- "/home/inkhorn/Downloads/casino_survey_results20130325.csv"
casino <- read.csv(casino.csv, stringsAsFactors = TRUE)
casino.orig <- read.csv(casino.csv, stringsAsFactors = TRUE)

# Dataset of Canadian postal codes with latitude/longitude coordinates,
# read in chunks of 50000 rows as an ffdf.
pcodes <- read.csv.ffdf(file = "/home/inkhorn/Downloads/zipcodeset.txt",
                        first.rows = 50000, next.rows = 50000,
                        colClasses = NA, header = FALSE)
# Numeric recoding of the text responses, driven by lookup tables. Each named
# vector maps a response label to its score; any response that is not in the
# table (blank answers included) is scored NA.
score.responses <- function(responses, key) {
  unname(key[as.character(responses)])
}

q1.key <- c("Strongly Opposed" = 1,
            "Somewhat Opposed" = 2,
            "Neutral or Mixed Feelings" = 3,
            "Somewhat in Favour" = 4,
            "Strongly in Favour" = 5)
q2.key <- c("Does Not Fit My Image At All" = 1,
            "Neutral / I am Not Sure" = 2,
            "Fits Image Somewhat" = 3,
            "Fits Image Perfectly" = 4)
importance.key <- c("Not Important At All" = 1,
                    "Somewhat Important" = 2,
                    "Very Important" = 3)
suitability.key <- c("Strongly Unsuitable" = 1,
                     "Somewhat Unsuitable" = 2,
                     "Neutral or Mixed Feelings" = 3,
                     "Somewhat Suitable" = 4,
                     "Highly Suitable" = 5)

casino$Q1_A <- score.responses(casino.orig$Q1_A, q1.key)
casino$Q2_A <- score.responses(casino.orig$Q2_A, q2.key)

# Columns 8 to 24 hold the importance ratings.
importance.cols <- 8:24
casino[importance.cols] <- lapply(casino.orig[importance.cols],
                                  score.responses, key = importance.key)

# These six columns hold the suitability ratings.
suitability.cols <- c(31:32, 47, 48, 63, 64)
casino[suitability.cols] <- lapply(casino.orig[suitability.cols],
                                   score.responses, key = suitability.key)
# The original dataset contains blank responses. When plotting responses in
# their original text form they are dropped in some cases and coded as
# "Did not disclose" in others.
q1.blank <- casino.orig$Q1_A == ""
casino.orig$Q1_A[q1.blank] <- NA
casino.orig$Q1_A <- factor(casino.orig$Q1_A,
                           levels = c("Strongly Opposed",
                                      "Somewhat Opposed",
                                      "Neutral or Mixed Feelings",
                                      "Somewhat in Favour",
                                      "Strongly in Favour"))

# Layers shared by the three single-question bar charts below: horizontal bars
# showing each answer's share of responses, with the share printed as text.
# NOTE(review): `percent` is not defined in this file and no attached package
# is visibly responsible for it; presumably it is scales::percent made
# available by one of the libraries loaded above -- confirm.
share.bar.layers <- list(
  geom_bar(fill = "forest green"),
  coord_flip(),
  scale_x_discrete(name = ""),
  theme_wsj(),
  theme(title = element_text(size = 22), plot.title = element_text(hjust = .8)),
  stat_bin(aes(label = sprintf("%.02f %%", ..count../sum(..count..)*100)),
           geom = "text"),
  scale_y_continuous(labels = percent)
)

# How people feel about a new casino (missing answers dropped)
ggplot(subset(casino.orig, !is.na(Q1_A)),
       aes(x = Q1_A, y = ..count../sum(..count..))) +
  ggtitle("How do you feel about having a new casino in Toronto?") +
  share.bar.layers

# How the casino fits into people's image of Toronto (blank answers dropped)
ggplot(subset(casino.orig, Q2_A != ""),
       aes(x = Q2_A, y = ..count../sum(..count..))) +
  ggtitle("How does a new casino in Toronto fit your image of the City of Toronto?") +
  share.bar.layers

# Where people would prefer to see it located (blank answers dropped)
ggplot(subset(casino.orig, Q6 != ""),
       aes(x = Q6, y = ..count../sum(..count..))) +
  ggtitle("If a casino is built, where would you prefer to see it located?") +
  share.bar.layers
# Order the text labels of the downtown suitability questions by the numeric
# scores held in the recoded `casino` data frame.
casino.orig$Q7_A_StandAlone <- reorder(casino.orig$Q7_A_StandAlone,
                                       casino$Q7_A_StandAlone)
casino.orig$Q7_A_Integrated <- reorder(casino.orig$Q7_A_Integrated,
                                       casino$Q7_A_Integrated)

# Proportion of responses per rating, taken over the first five levels of each
# reordered factor, one column per casino type.
downtown.standalone <- prop.table(as.matrix(table(casino.orig$Q7_A_StandAlone)[1:5]))
downtown.integrated <- prop.table(as.matrix(table(casino.orig$Q7_A_Integrated)[1:5]))
stand.and.integrated.ratings.downtown <- cbind(downtown.standalone,
                                               downtown.integrated)
colnames(stand.and.integrated.ratings.downtown) <-
  c("Standalone Casino", "Integrated Entertainment Complex")

# Long format (one row per rating and casino type) for plotting.
stand.and.integrated.ratings.downtown.long <-
  melt(stand.and.integrated.ratings.downtown,
       varnames = c("Rating", "Casino Type"),
       value.name = "Percentage")

# Ratings of casino suitability for the downtown location.
# NOTE(review): theme_wsj() is added after theme(...); if it is a complete
# theme it replaces the title settings made just before it -- confirm the
# intended order (the single-question charts apply them the other way round).
ggplot(stand.and.integrated.ratings.downtown.long,
       aes(x = stand.and.integrated.ratings.downtown.long$"Casino Type",
           fill = Rating,
           y = Percentage,
           label = sprintf("%.02f %%", Percentage*100))) +
  geom_bar(position = "dodge") +
  coord_flip() +
  ggtitle("Ratings of Casino Suitability \nin Downtown Toronto by Casino Type") +
  scale_x_discrete(name = "") +
  theme(title = element_text(size = 22), plot.title = element_text(hjust = .8)) +
  scale_y_continuous(labels = percent) +
  geom_text(aes(x = stand.and.integrated.ratings.downtown.long$"Casino Type",
                y = Percentage,
                ymax = Percentage,
                label = sprintf("%.01f%%", Percentage*100),
                hjust = .75),
            position = position_dodge(width = 1),
            size = 4) +
  scale_fill_few(palette = "light") +
  theme_wsj()
# Exhibition Place: proportion of responses per rating, taken over table
# entries 2 to 6 of each question, one column per casino type.
exhibition.standalone <- prop.table(as.matrix(table(casino.orig$Q7_B_StandAlone)[2:6]))
exhibition.integrated <- prop.table(as.matrix(table(casino.orig$Q7_B_Integrated)[2:6]))
stand.and.integrated.ratings.exhibition <- cbind(exhibition.standalone,
                                                 exhibition.integrated)
colnames(stand.and.integrated.ratings.exhibition) <-
  c("Standalone Casino", "Integrated Entertainment Complex")

# Long format (one row per rating and casino type) for plotting.
stand.and.integrated.ratings.exhibition.long <-
  melt(stand.and.integrated.ratings.exhibition,
       varnames = c("Rating", "Casino Type"),
       value.name = "Percentage")

# Put the rating labels in the same order as the first five levels of the
# downtown stand-alone question.
stand.and.integrated.ratings.exhibition.long$Rating <-
  factor(stand.and.integrated.ratings.exhibition.long$Rating,
         levels = levels(casino.orig$Q7_A_StandAlone)[1:5])

# Ratings of casino suitability for the Exhibition Place location.
# NOTE(review): theme_wsj() is added after theme(...); if it is a complete
# theme it replaces the title settings made just before it -- confirm the
# intended order.
ggplot(stand.and.integrated.ratings.exhibition.long,
       aes(x = stand.and.integrated.ratings.exhibition.long$"Casino Type",
           fill = Rating,
           y = Percentage,
           label = sprintf("%.02f %%", Percentage*100))) +
  geom_bar(position = "dodge") +
  coord_flip() +
  ggtitle("Ratings of Casino Suitability \nat Exhibition Place by Casino Type") +
  scale_x_discrete(name = "") +
  theme(title = element_text(size = 22), plot.title = element_text(hjust = .8)) +
  scale_y_continuous(labels = percent) +
  geom_text(aes(x = stand.and.integrated.ratings.exhibition.long$"Casino Type",
                y = Percentage,
                ymax = Percentage,
                label = sprintf("%.01f%%", Percentage*100),
                hjust = .75),
            position = position_dodge(width = 1),
            size = 4) +
  scale_fill_few(palette = "light") +
  theme_wsj()
# Port Lands: proportion of responses per rating, taken over table entries
# 2 to 6 of each question, one column per casino type.
portlands.standalone <- prop.table(as.matrix(table(casino.orig$Q7_C_StandAlone)[2:6]))
portlands.integrated <- prop.table(as.matrix(table(casino.orig$Q7_C_Integrated)[2:6]))
stand.and.integrated.ratings.portlands <- cbind(portlands.standalone,
                                                portlands.integrated)
colnames(stand.and.integrated.ratings.portlands) <-
  c("Standalone Casino", "Integrated Entertainment Complex")

# Long format (one row per rating and casino type) for plotting.
stand.and.integrated.ratings.portlands.long <-
  melt(stand.and.integrated.ratings.portlands,
       varnames = c("Rating", "Casino Type"),
       value.name = "Percentage")

# Put the rating labels in the same order as the first five levels of the
# downtown stand-alone question.
stand.and.integrated.ratings.portlands.long$Rating <-
  factor(stand.and.integrated.ratings.portlands.long$Rating,
         levels = levels(casino.orig$Q7_A_StandAlone)[1:5])

# Ratings of casino suitability for the Port Lands location.
# NOTE(review): theme_wsj() is added after theme(...); if it is a complete
# theme it replaces the title settings made just before it -- confirm the
# intended order.
ggplot(stand.and.integrated.ratings.portlands.long,
       aes(x = stand.and.integrated.ratings.portlands.long$"Casino Type",
           fill = Rating,
           y = Percentage,
           label = sprintf("%.02f %%", Percentage*100))) +
  geom_bar(position = "dodge") +
  coord_flip() +
  ggtitle("Ratings of Casino Suitability \nat Port Lands by Casino Type") +
  scale_x_discrete(name = "") +
  theme(title = element_text(size = 22), plot.title = element_text(hjust = .8)) +
  scale_y_continuous(labels = percent) +
  geom_text(aes(x = stand.and.integrated.ratings.portlands.long$"Casino Type",
                y = Percentage,
                ymax = Percentage,
                label = sprintf("%.01f%%", Percentage*100),
                hjust = .75),
            position = position_dodge(width = 1),
            size = 4) +
  scale_fill_few(palette = "light") +
  theme_wsj()
# Postal codes (FSAs really, i.e. the first three characters) and their
# coordinates. The geocode file is read as an ffdf, so the survey data frame
# is converted to an ffdf as well before the two are merged.
#
# PostalCode is upper-cased and turned back into a factor BEFORE as.ffdf():
# toupper() returns a character vector and ff has no character storage mode,
# so converting the data frame while it holds a character column fails.
# Doing the conversion here also makes a second upper-casing pass on the ffdf
# unnecessary.
casino.orig$PostalCode <- as.factor(toupper(as.character(casino.orig$PostalCode)))

pcodes <- read.csv.ffdf(file = "/home/inkhorn/Downloads/zipcodeset.txt",
                        first.rows = 50000, next.rows = 50000,
                        colClasses = NA, header = FALSE)
names(pcodes) <- c("Postal", "Lat", "Long", "City", "Prov")

# FSA = first three characters of the full postal code, upper-cased.
pcodes$FSA <- as.ff(as.factor(toupper(substr(pcodes[, "Postal"], 1, 3))))

casino.orig <- as.ffdf(casino.orig)

# Keep every survey row (all.x = TRUE); rows whose PostalCode has no matching
# FSA end up with NA in the geocode columns.
casino.orig <- merge(casino.orig, pcodes,
                     by.x = "PostalCode", by.y = "FSA", all.x = TRUE)
# Full map: plot every respondent that was matched to coordinates.
# Rows without a latitude are dropped first.
has.coords <- !is.na(casino.orig[, "Lat"])
casino.gc <- casino.orig[which(has.coords), ]

mymap <- MapBackground(lat = casino.gc$Lat, lon = casino.gc$Long)
PlotOnStaticMap(mymap, casino.gc$Lat, casino.gc$Long,
                cex = 1.5, pch = 21, bg = "orange")
# Build the list of cities by number of responses, winnow it down, and use it
# to filter the geocoded records so the map can zoom in on Southern Ontario.
cities <- data.frame(table(casino.orig[, "City"]))

# Drop cities nobody responded from and the blank city label, then sort by
# response count, largest first.
cities <- cities[cities$Freq > 0 & cities$Var1 != "", ]
cities <- cities[order(cities$Freq, decreasing = TRUE), ]

# Top cities (28 is an arbitrary dividing line). head() is used instead of
# cities[1:28, ] so that fewer than 28 cities does not pad the result with
# all-NA rows.
cities.filter <- head(cities, 28)
names(cities.filter) <- c("City", "# Responses")

# Keep only the records from the top cities. The column must be referenced by
# its new name: after the rename above `cities.filter$Var1` no longer exists
# (it is NULL), which made the filter select no rows at all.
casino.top.so <- casino.orig[which(casino.orig[, "City"] %in% cities.filter$City), ]
# here's a transparency function that I used for the southern ontario map | |
addTrans <- function(color, trans) {
  # Adds transparency to one or more colours.
  #
  # Args:
  #   color: colour specification(s) accepted by col2rgb().
  #   trans: alpha value(s) between 0 and 255, 0 being fully transparent and
  #          255 being fully visible. Fractional values are truncated.
  #
  # `color` and `trans` must have the same length, or one of the two must have
  # length 1, in which case it is repeated to the length of the other.
  #
  # Returns:
  #   A character vector of "#RRGGBBAA" strings, one per colour; character(0)
  #   when either input is empty.
  #
  # Stops when the lengths are incompatible or when `trans` is not a numeric
  # value in [0, 255] (out-of-range values used to produce malformed strings
  # containing "NA").
  n.color <- length(color)
  n.trans <- length(trans)
  if (n.color == 0 || n.trans == 0) {
    return(character(0))
  }
  if (n.color != n.trans && n.color != 1 && n.trans != 1) {
    stop("Vector lengths not correct")
  }
  if (!is.numeric(trans) || any(is.na(trans)) || any(trans < 0 | trans > 255)) {
    stop("trans must be numeric with values between 0 and 255")
  }
  if (n.color == 1) color <- rep(color, n.trans)
  if (n.trans == 1) trans <- rep(trans, n.color)

  # One column per colour; rows are red, green, blue and alpha.
  channels <- rbind(col2rgb(color), trans)
  # Integer storage truncates fractional alpha values and lets sprintf()
  # format every channel as two upper-case hex digits.
  storage.mode(channels) <- "integer"
  hex <- matrix(sprintf("%02X", channels), nrow = 4)
  paste0("#", hex[1, ], hex[2, ], hex[3, ], hex[4, ])
}
# Southern Ontario map: the same kind of plot as the full map, drawn from the
# records in `casino.top.so`, with the marker fill made almost fully
# transparent (alpha 10 on addTrans()'s 0-255 scale).
so.lat <- casino.top.so$Lat
so.lon <- casino.top.so$Long
mymap <- MapBackground(lat = so.lat, lon = so.lon)
PlotOnStaticMap(mymap, so.lat, so.lon,
                cex = 1.5, pch = 21, bg = addTrans("orange", 10))
# Question 3: issues of importance regarding the new casino. For each of the
# 16 issues, compute the share of non-missing answers scored 3
# ("Very Important") and plot the issues from lowest to highest share.
q3.concerns <- c("Design of the facility",
                 "Employment opportunities",
                 "Entertainment and cultural activities",
                 "Expanded convention facilities",
                 "Integration with surrounding areas",
                 "New hotel accommodations",
                 "Problem gambling & health concerns",
                 "Public safety and social concerns",
                 "Public space",
                 "Restaurants",
                 "Retail",
                 "Revenue for the City",
                 "Support for local businesses",
                 "Tourist attraction",
                 "Traffic concerns",
                 "Training and career development")

# Columns 8 to 23 of `casino` are taken, in order, as the 16 issues above.
q3.share <- vapply(casino[8:23],
                   function(scores) mean(scores == 3, na.rm = TRUE),
                   numeric(1))
names(q3.share) <- q3.concerns
q3.share <- q3.share[order(q3.share, decreasing = FALSE)]

# One row per issue, sorted ascending; the factor levels follow the row order
# so the plot keeps that ordering.
q3.summary <- data.frame("% Very Important" = q3.share, check.names = FALSE)
q3.summary$Concern <- factor(rownames(q3.summary), levels = rownames(q3.summary))

# NOTE(review): theme_wsj() is added after theme(...); if it is a complete
# theme it replaces the title settings made just before it -- confirm the
# intended order.
ggplot(q3.summary, aes(x = Concern, y = q3.summary$"% Very Important")) +
  geom_point(size = 5, colour = "forest green") +
  coord_flip() +
  ggtitle("Issues of Importance Surrounding\nthe New Casino") +
  scale_x_discrete(name = "Issues of Importance") +
  theme(title = element_text(size = 22), plot.title = element_text(hjust = .8)) +
  scale_y_continuous(labels = percent) +
  theme_wsj()
# Features people might want if a new Integrated Entertainment Complex is
# built. This first summary is plotted under the "Downtown Toronto" title:
# the mean of each of columns 36 to 44 of `casino` is labelled with one of
# the nine features and plotted from lowest to highest.
q7a.features <- c("No Casino", "Casino Only", "Convention Centre Space",
                  "Cultural and Arts Facilities", "Hotel", "Nightclubs",
                  "Restaurants", "Retail", "Theatre")

q7a.share <- vapply(casino[36:44], mean, numeric(1), na.rm = TRUE)
names(q7a.share) <- q7a.features
q7a.share <- q7a.share[order(q7a.share, decreasing = FALSE)]

# One row per feature, sorted ascending; factor levels follow the row order.
q7a.summary <- data.frame("% Include" = q7a.share, check.names = FALSE)
q7a.summary$feature <- factor(rownames(q7a.summary), levels = rownames(q7a.summary))

# NOTE(review): theme_wsj() is added after theme(...); if it is a complete
# theme it replaces the title settings made just before it -- confirm the
# intended order.
ggplot(q7a.summary, aes(x = feature, y = q7a.summary$"% Include")) +
  geom_point(size = 5, colour = "forest green") +
  coord_flip() +
  ggtitle("What People Would Want in an Integrated\nEntertainment Complex in Downtown Toronto") +
  scale_x_discrete(name = "Features") +
  theme(title = element_text(size = 22), plot.title = element_text(hjust = .8)) +
  scale_y_continuous(labels = percent, name = "% Wanting the Feature") +
  theme_wsj()
# Same feature summary, plotted under the Exhibition Place title: the mean of
# each of columns 52 to 60 of `casino` is labelled with one of the nine
# features and plotted from lowest to highest.
q7b.features <- c("No Casino", "Casino Only", "Convention Centre Space",
                  "Cultural and Arts Facilities", "Hotel", "Nightclubs",
                  "Restaurants", "Retail", "Theatre")

q7b.share <- vapply(casino[52:60], mean, numeric(1), na.rm = TRUE)
names(q7b.share) <- q7b.features
q7b.share <- q7b.share[order(q7b.share, decreasing = FALSE)]

# One row per feature, sorted ascending; factor levels follow the row order.
q7b.summary <- data.frame("% Include" = q7b.share, check.names = FALSE)
q7b.summary$feature <- factor(rownames(q7b.summary), levels = rownames(q7b.summary))

# NOTE(review): "Exhbition" in the title below looks like a typo for
# "Exhibition"; left untouched because it is text shown on the plot.
# NOTE(review): theme_wsj() is added after theme(...); if it is a complete
# theme it replaces the title settings made just before it -- confirm the
# intended order.
ggplot(q7b.summary, aes(x = feature, y = q7b.summary$"% Include")) +
  geom_point(size = 5, colour = "forest green") +
  coord_flip() +
  ggtitle("What People Would Want in an Integrated\nEntertainment Complex at the Exhbition Place") +
  scale_x_discrete(name = "Features") +
  theme(title = element_text(size = 22), plot.title = element_text(hjust = .8)) +
  scale_y_continuous(labels = percent, name = "% Wanting the Feature") +
  theme_wsj()
# Same feature summary, plotted under the Port Lands title: the mean of each
# of columns 68 to 76 of `casino` is labelled with one of the nine features
# and plotted from lowest to highest.
q7c.features <- c("No Casino", "Casino Only", "Convention Centre Space",
                  "Cultural and Arts Facilities", "Hotel", "Nightclubs",
                  "Restaurants", "Retail", "Theatre")

q7c.share <- vapply(casino[68:76], mean, numeric(1), na.rm = TRUE)
names(q7c.share) <- q7c.features
q7c.share <- q7c.share[order(q7c.share, decreasing = FALSE)]

# One row per feature, sorted ascending; factor levels follow the row order.
q7c.summary <- data.frame("% Include" = q7c.share, check.names = FALSE)
q7c.summary$feature <- factor(rownames(q7c.summary), levels = rownames(q7c.summary))

# The y values must come from q7c.summary. The plot previously used
# q7b.summary$"% Include", so the Port Lands chart paired the Port Lands
# feature ordering with the Exhibition Place percentages.
# NOTE(review): theme_wsj() is added after theme(...); if it is a complete
# theme it replaces the title settings made just before it -- confirm the
# intended order.
ggplot(q7c.summary, aes(x = feature, y = q7c.summary$"% Include")) +
  geom_point(size = 5, colour = "forest green") +
  coord_flip() +
  ggtitle("What People Would Want in an Integrated\nEntertainment Complex in Port Lands") +
  scale_x_discrete(name = "Features") +
  theme(title = element_text(size = 22), plot.title = element_text(hjust = .8)) +
  scale_y_continuous(labels = percent, name = "% Wanting the Feature") +
  theme_wsj()
# Yet another copy of the survey is read as a plain data frame so that the
# demographic columns can be used without ffdf indexing notation
# (e.g. df[,"variable1"]); `casino.orig` is an ffdf by this point.
casino.orig2 <- read.csv("/home/inkhorn/Downloads/casino_survey_results20130325.csv")

# Gender: anything other than the three listed answers (blank and missing
# included) is reported as "Did not disclose".
#
# The earlier `casino$Gender = casino.orig$Gender` line was removed: it copied
# a column out of the merged ffdf into a plain data frame and was overwritten
# immediately by the recode below, so it had no effect on the result.
gender <- as.character(casino.orig2$Gender)
gender[!(gender %in% c("Female", "Male", "Transgendered"))] <- "Did not disclose"
casino$Gender <- factor(gender,
                        levels = c("Transgendered", "Did not disclose",
                                   "Female", "Male"))

ggplot(casino, aes(x = Gender, y = ..count../sum(..count..))) +
  geom_bar(fill = "forest green") +
  coord_flip() +
  ggtitle("Gender Distribution of Respondents") +
  scale_x_discrete(name = "") +
  theme_wsj() +
  theme(title = element_text(size = 22), plot.title = element_text(hjust = .8)) +
  stat_bin(aes(label = sprintf("%.02f %%", ..count../sum(..count..)*100)),
           geom = "text") +
  scale_y_continuous(labels = percent)
# Age: keep the eight known age groups as they are and report every other
# non-missing answer (blank included) as "Did not disclose". Missing values
# stay missing.
age.groups <- c("Under 15", "15-24", "25-34", "35-44",
                "45-54", "55-64", "65 or older")
age <- as.character(casino.orig2$Age)
age[!is.na(age) & !(age %in% age.groups)] <- "Did not disclose"
casino$Age <- factor(age, levels = c("Did not disclose", age.groups))

ggplot(casino, aes(x = Age, y = ..count../sum(..count..))) +
  geom_bar(fill = "forest green") +
  coord_flip() +
  ggtitle("Age Distribution of Respondents") +
  scale_x_discrete(name = "") +
  theme_wsj() +
  theme(title = element_text(size = 22), plot.title = element_text(hjust = .8)) +
  stat_bin(aes(label = sprintf("%.02f %%", ..count../sum(..count..)*100)),
           geom = "text") +
  scale_y_continuous(labels = percent)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment