Skip to content

Instantly share code, notes, and snippets.

@milesgrimshaw
Created March 17, 2015 14:42
Show Gist options
  • Save milesgrimshaw/477b32e661518de9b6c2 to your computer and use it in GitHub Desktop.
Save milesgrimshaw/477b32e661518de9b6c2 to your computer and use it in GitHub Desktop.
Script to parse MindBody data
# Load libraries
library(ggplot2)
# Set working directory
# NOTE(review): machine-specific path; the CSV read below and the PDFs written
# at the end of the script are all resolved relative to it.
setwd("~/Dropbox (Personal)/Personal/Github/MindBody/New/")
# Read in the data; as.is = TRUE keeps text columns as character vectors
data <- read.csv("mind_new.csv", header = TRUE, as.is = TRUE)
# Convert the text columns to lower case so values that differ only in
# capitalisation compare equal
text_cols <- c("Country", "City", "Name")
data[text_cols] <- lapply(data[text_cols], tolower)
# Eliminate duplicates: keep the first row for each Name/Lon/Lat combination.
# The columns are compared directly instead of being pasted into one string
# with no separator, which could merge distinct rows (e.g. "gym1" + 2 and
# "gym" + 12 both paste to "gym12"). No scratch key column is left in `data`.
data <- data[!duplicated(data[c("Name", "Lon", "Lat")]), ]
# Descriptive exploration: number of rows, then the number of distinct
# countries and distinct cities
NROW(data)
length(unique(data[["Country"]]))
length(unique(data[["City"]]))
# Country breakdown
# table() tallies every country in one pass, instead of rescanning the whole
# column once per distinct country. Missing (NA) countries are left out of the
# tally rather than appearing as a zero-count row.
country <- as.data.frame(
  table(Country = data$Country),
  responseName = "Count",
  stringsAsFactors = FALSE
)
# Smallest counts first (ties fall back to alphabetical order)
country <- country[order(country$Count, decreasing = FALSE), ]
summary(country$Count)
# Countries with more than 10 rows
country_top <- country[country$Count > 10, ]
# Make Country a factor whose levels are ordered by Count, so the bars are
# drawn in count order rather than alphabetically
country <- transform(country, Country = reorder(Country, Count))
country_top <- transform(country_top, Country = reorder(Country, Count))
# Plot the countries
# Horizontal bar chart of every country; shown here, then the name is reused
countryplot <- ggplot(country, aes(x = Country, y = Count)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  ggtitle("MindBodyOnline Clients by Country")
countryplot
# Same chart restricted to country_top; this is the object that stays bound
# to `countryplot` afterwards
countryplot <- ggplot(country_top, aes(x = Country, y = Count)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  ggtitle("MindBodyOnline Clients by Country (10+ Customers)")
countryplot
# Give the HTML-escaped children's category a shorter, readable label
data$Industry <- replace(
  data$Industry,
  data$Industry %in% "Children&#39;s Programs",
  "Child Programs"
)
# Store Industry as a factor and show the resulting levels
data$Industry <- factor(data$Industry)
levels(data$Industry)
# Keep only the rows whose Country is "united states"
US <- data[data$Country %in% "united states", ]
# City breakdown across all countries
# table() tallies every city in one pass, instead of rescanning the whole
# column once per distinct city. Rows of `city` come out in alphabetical order
# and missing (NA) cities are left out of the tally; the chart is unaffected
# because bar order comes from the factor levels set below.
city <- as.data.frame(
  table(City = data$City),
  responseName = "Count",
  stringsAsFactors = FALSE
)
# Cities with more than 50 rows
city_top <- city[city$Count > 50, ]
# Make City a factor whose levels are ordered by Count
city <- transform(city, City = reorder(City, Count))
city_top <- transform(city_top, City = reorder(City, Count))
# Plot the top cities
cityplot <- ggplot(city_top, aes(x = City, y = Count)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  ggtitle("MindBodyOnline Clients by City (50+ Customers)")
cityplot
# Tally rows per industry tag.
#
# `industry` is a vector of industry labels (character or factor). Returns a
# data frame with one row per label that actually occurs, sorted by ascending
# Count, with columns:
#   Industry - factor whose levels are ordered by Count (controls bar order)
#   Count    - number of occurrences of that label
# as.character() drops unused factor levels, so a subset of the data only
# reports the industries present in that subset. Missing (NA) labels are left
# out of the tally.
count_by_industry <- function(industry) {
  counts <- as.data.frame(
    table(Industry = as.character(industry)),
    responseName = "Count",
    stringsAsFactors = FALSE
  )
  counts <- counts[order(counts$Count, decreasing = FALSE), ]
  transform(counts, Industry = reorder(Industry, Count))
}

# Create a data frame of count by industry tags
# NOTE: the name `all` masks base::all() for non-call lookups; it is kept
# because the plotting code refers to it.
all <- count_by_industry(data$Industry)
# Data frame by industry just for the U.S.
us <- count_by_industry(US$Industry)
# Data frame by industry just for the NYC
NYC <- US[which(US$City == "new york"), ]
nyc <- count_by_industry(NYC$Industry)
# Plot the industries
# Horizontal bar chart of Count per Industry for one counts table
segment_barplot <- function(counts, title) {
  bars <- ggplot(counts, aes(x = Industry, y = Count)) +
    geom_bar(stat = "identity")
  bars + coord_flip() + ggtitle(title)
}

# All clients
allplot <- segment_barplot(all, "MindBodyOnline Clients by Segment")
allplot
# U.S. clients only
usplot <- segment_barplot(us, "MindBodyOnline Clients by Segment in the U.S.")
usplot
# New York City clients only
nycplot <- segment_barplot(
  nyc,
  "MindBodyOnline Clients by Segment in New York City"
)
nycplot
# Count the business names that contain each keyword, for all rows (data)
# and, where shown, for the U.S. subset (US)
sum(grepl("crossfit", data$Name))
sum(grepl("crossfit", US$Name))
sum(grepl("zumba", data$Name))
sum(grepl("yoga", data$Name))
sum(grepl("bikram", data$Name))
sum(grepl("bikram", US$Name))
# Save the plots
# Each plot is written to its own 11 x 8.5 inch PDF in the working directory.
# NOTE(review): nycplot has no PDF here — confirm whether that is intentional.
plots_to_save <- list(
  "MindBody Countries.pdf" = countryplot,
  "MindBody Cities.pdf" = cityplot,
  "MindBody Industry Segments.pdf" = allplot,
  "MindBody Industry Segments US.pdf" = usplot
)
for (pdf_file in names(plots_to_save)) {
  pdf(file = pdf_file, width = 11, height = 8.5)
  # A ggplot object is only drawn when it is printed. A bare name relies on
  # top-level auto-printing, which does not happen under source(), so print()
  # is called explicitly. The former par(las/mar) calls were dropped: they set
  # base-graphics parameters, which ggplot2's grid-based drawing does not use.
  tryCatch(
    print(plots_to_save[[pdf_file]]),
    # Close the device even if rendering fails, so the file is not left open
    finally = dev.off()
  )
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment