# Script to parse MindBody data
# Source: GitHub gist milesgrimshaw/477b32e661518de9b6c2
# Created March 17, 2015 14:42
# Load libraries
library(ggplot2)

# Set working directory
# NOTE(review): this is an absolute, user-specific path; the relative file
# names used by this script (the input CSV and the output PDFs) resolve
# against it, so adjust it before running on another machine.
setwd("~/Dropbox (Personal)/Personal/Github/MindBody/New/")

# Read in the data; as.is = TRUE keeps text columns as character vectors
data <- read.csv("mind_new.csv", header = TRUE, as.is = TRUE)

# Convert everything to lower case so values that differ only in
# capitalisation are treated as the same country / city / business name
data$Country <- tolower(data$Country)
data$City <- tolower(data$City)
data$Name <- tolower(data$Name)

# Eliminate duplicates: keep the first row of each (Name, Lon, Lat) combination.
# The three columns are compared directly instead of through a key pasted
# together without a separator, because such a key can merge distinct rows
# (e.g. "a1" + "2" and "a" + "12" both collapse to "a12"). No helper column
# is added to `data` in the process.
data <- data[!duplicated(data[, c("Name", "Lon", "Lat")]), ]
# Descriptive exploration: number of businesses, distinct countries and
# distinct cities (the values auto-print when run at top level)
nrow(data)
length(unique(data$Country))
length(unique(data$City))

# Country breakdown: one row per country with its number of businesses.
# table() tallies all countries in a single pass. Rows whose country is
# missing (NA) are left out of the tally rather than reported as a
# zero-count "NA" country.
country <- as.data.frame(
  table(Country = data$Country),
  responseName = "Count",
  stringsAsFactors = FALSE
)
country <- country[order(country$Count, decreasing = FALSE), ]
summary(country$Count)

# NOTE(review): the filter keeps countries with strictly more than 10
# businesses, while the plot title below says "10+"; confirm which is intended.
country_top <- country[country$Count > 10, ]

# Make Country a factor whose levels are ordered by Count, so the bars are
# drawn in count order instead of alphabetically
country <- transform(country, Country = reorder(Country, Count))
country_top <- transform(country_top, Country = reorder(Country, Count))

# Plot the countries
countryplot <- ggplot(country, aes(x = Country, y = Count)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  ggtitle("MindBodyOnline Clients by Country")
countryplot

# countryplot is reassigned here: from this point on it refers to the
# filtered chart, not the all-countries chart drawn above
countryplot <- ggplot(country_top, aes(x = Country, y = Count)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  ggtitle("MindBodyOnline Clients by Country (10+ Customers)")
countryplot
# Rename industries to be easy to read.
# The label contains an apostrophe, so it must be written with double quotes;
# inside single quotes the apostrophe would end the string early.
# %in% never yields NA, so rows with a missing industry are left untouched.
data$Industry[data$Industry %in% "Children's Programs"] <- "Child Programs"

# Create levels for each unique industry
data$Industry <- factor(data$Industry)
levels(data$Industry)

# Select only US businesses. The comparison uses the lower-case spelling
# because Country is lower-cased when the data is loaded; rows with a
# missing country are not selected.
US <- data[data$Country %in% "united states", ]
# City breakdown across all countries: one row per city name with its number
# of businesses. Cities are matched on name only, so identically named cities
# in different countries are counted together.
# table() tallies all cities in a single pass. Rows whose city is missing (NA)
# are left out of the tally rather than reported as a zero-count "NA" city.
city <- as.data.frame(
  table(City = data$City),
  responseName = "Count",
  stringsAsFactors = FALSE
)

# NOTE(review): the filter keeps cities with strictly more than 50 businesses,
# while the plot title below says "50+"; confirm which is intended.
city_top <- city[city$Count > 50, ]

# Make City a factor whose levels are ordered by Count, so the bars are drawn
# in count order instead of alphabetically
city <- transform(city, City = reorder(City, Count))
city_top <- transform(city_top, City = reorder(City, Count))

# Plot the top cities
cityplot <- ggplot(city_top, aes(x = City, y = Count)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  ggtitle("MindBodyOnline Clients by City (50+ Customers)")
cityplot
# Tally businesses per industry tag.
#
# industry: vector (character or factor) of industry labels, one per business.
# Returns a data frame with columns Industry and Count, sorted by ascending
# Count, where Industry is a factor whose levels are ordered by Count so that
# bar charts are drawn in count order.
count_by_industry <- function(industry) {
  # as.character() ensures only labels actually present are tallied; tabling a
  # factor directly would also list its unused levels with a count of zero.
  # Missing labels (NA) are left out of the tally.
  counts <- as.data.frame(
    table(Industry = as.character(industry)),
    responseName = "Count",
    stringsAsFactors = FALSE
  )
  counts <- counts[order(counts$Count, decreasing = FALSE), ]
  transform(counts, Industry = reorder(Industry, Count))
}

# Create a data frame of count by industry tags
all <- count_by_industry(data$Industry)

# Data frame by industry just for the U.S.
us <- count_by_industry(US$Industry)

# Data frame by industry just for NYC (US rows whose city is "new york";
# rows with a missing city are not selected)
NYC <- US[US$City %in% "new york", ]
nyc <- count_by_industry(NYC$Industry)
# Plot the industries: horizontal bar charts of business counts per industry
# segment, for all countries, for the U.S. only, and for New York City only.
# Each chart is built, stored, and then displayed by evaluating its name.
allplot <- ggplot(all, aes(x = Industry, y = Count)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  ggtitle("MindBodyOnline Clients by Segment")
allplot

usplot <- ggplot(us, aes(x = Industry, y = Count)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  ggtitle("MindBodyOnline Clients by Segment in the U.S.")
usplot

nycplot <- ggplot(nyc, aes(x = Industry, y = Count)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  ggtitle("MindBodyOnline Clients by Segment in New York City")
nycplot
# How many business names contain each keyword.
# Names are lower-cased when the data is loaded, so lower-case keywords match
# regardless of the original capitalisation. fixed = TRUE treats each keyword
# as a literal substring; grepl() returns FALSE for missing names, so they
# are never counted.
sum(grepl("crossfit", data$Name, fixed = TRUE))
sum(grepl("crossfit", US$Name, fixed = TRUE))
sum(grepl("zumba", data$Name, fixed = TRUE))
sum(grepl("yoga", data$Name, fixed = TRUE))
sum(grepl("bikram", data$Name, fixed = TRUE))
sum(grepl("bikram", US$Name, fixed = TRUE))
# Save the plots, one 11 x 8.5 inch PDF per chart, written to the current
# working directory.
# NOTE(review): nycplot is not saved here; add an entry if it should be.
plots_to_save <- list(
  "MindBody Countries.pdf" = countryplot,
  "MindBody Cities.pdf" = cityplot,
  "MindBody Industry Segments.pdf" = allplot,
  "MindBody Industry Segments US.pdf" = usplot
)

for (pdf_file in names(plots_to_save)) {
  pdf(file = pdf_file, width = 11, height = 8.5)
  # A ggplot object is only drawn when it is printed. print() is called
  # explicitly because nothing auto-prints inside a loop (or when the script
  # is run through source()), which would leave the PDF without a plot.
  # Base-graphics par() settings (las, mar) are not set: ggplot2 draws with
  # grid graphics and ignores them.
  # finally = dev.off() closes the PDF device even if drawing fails.
  tryCatch(
    print(plots_to_save[[pdf_file]]),
    finally = dev.off()
  )
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment