Skip to content

Instantly share code, notes, and snippets.

@jdavidson
Created February 3, 2014 16:19
Show Gist options
  • Save jdavidson/8786950 to your computer and use it in GitHub Desktop.
Save jdavidson/8786950 to your computer and use it in GitHub Desktop.
Regional analysis of Crunchbase data
library(ggplot2)
library(ggthemes)
library(dplyr)
library(lubridate)
library(scales)
library(data.table)
library(reshape2)
options(scipen=999)
options(stringsAsFactors = FALSE)
# Lagged difference between date-times, expressed in rounded 30-day "months".
#
# base::diff() on date-times picks its own units per call; going through
# difftime(units = "days") pins the unit so results are comparable.
#
# x:   vector of date-times (anything difftime() accepts), in the order the
#      differences should be taken.
# lag: how many positions apart the two elements of each pair are.
#
# Returns a difftime (units = "days") of length max(length(x) - lag, 0) whose
# values are the gap in days divided by 30 and rounded.
my.diff <- function(x, lag = 1) {
  n <- length(x)
  # With n <= lag there are no pairs to compare. The index ranges would
  # otherwise count downwards (e.g. 2:1 and 1:0) and yield bogus elements.
  if (n <= lag) {
    return(as.difftime(numeric(0), units = "days"))
  }
  later <- x[seq(lag + 1, n)]
  earlier <- x[seq_len(n - lag)]
  round(difftime(later, earlier, units = "days") / 30)
}
# Load the Crunchbase funding-rounds export.
rounds <- data.table(
  read.csv("2014-01-06-crunchbase_monthly_export_rounds.csv")
)

# dedup: keep one row per (company, funding date, round type).
# The key columns are passed to unique() explicitly because data.table >= 1.9.8
# compares *all* columns by default instead of the key.
round_key <- c("company_name", "funded_at", "funding_round_type")
setkeyv(rounds, round_key)
rounds <- unique(rounds, by = round_key)

# Drop rows carrying the "1960-01" month value.
rounds <- subset(rounds, funded_month != "1960-01")

# fix strange date data: rebuild funded_at as the first day of funded_month
rounds$funded_at <- ymd(paste(rounds$funded_month, "01", sep = "-"))
rounds <- arrange(rounds, funded_at)

# id: 1-based sequence number of each round within its company, in date order.
rounds <- rounds[, id := seq_along(funded_at), by = company_name]

# diff: gap (in rounded 30-day months) to the company's next round; NA on the
# company's last recorded round. Single-round groups are handled explicitly so
# the right-hand side always has exactly one value per row, and the result is
# stored as plain numeric so every group yields the same column type.
rounds <- rounds[
  ,
  diff := if (.N > 1L) {
    as.numeric(c(my.diff(funded_at), NA))
  } else {
    NA_real_
  },
  by = company_name
]
# clean up rounds: keep US companies that have a state code
rounds <- filter(
  rounds,
  company_country_code == "USA",
  company_state_code != ""
)

# Region names: strip the " - Other" suffix and upper-case.
rounds$company_region <- toupper(gsub(" - Other", "", rounds$company_region))
# City names: drop every character that is not alphanumeric, "/", "'" or a
# space, then upper-case.
rounds$company_city <- toupper(
  gsub("[^[:alnum:]///' ]", "", rounds$company_city)
)

# Force one state code for these three regions. The column is assigned by
# index directly rather than through a filtered copy of the table; which()
# skips rows whose region is NA.
rounds$company_state_code[which(rounds$company_region == "SF BAY")] <- "CA"
rounds$company_state_code[which(rounds$company_region == "NEW YORK")] <- "NY"
rounds$company_state_code[which(rounds$company_region == "LOS ANGELES")] <- "CA"

# Drop placeholder regions.
rounds <- filter(rounds, !company_region %in% c("UNKNOWN", "TBD"))

# geocode: "<REGION> <STATE> <COUNTRY>", space separated.
rounds$geocode <- paste(
  rounds$company_region,
  rounds$company_state_code,
  rounds$company_country_code
)

# Attach the broad category of each company. The first column of
# categories.csv is renamed so it can serve as the join key; the key is named
# explicitly so the join never silently picks up another shared column.
# NOTE(review): assumes categories.csv has a `broad_category` column and that
# `rounds` does not - confirm against the data files.
categories <- read.csv("categories.csv")
names(categories)[1] <- "company_category_code"
rounds <- data.table(inner_join(
  rounds,
  select(categories, company_category_code, broad_category),
  by = "company_category_code"
))

# Only enterprise and consumer companies are analysed.
rounds <- filter(rounds, broad_category %in% c("enterprise", "consumer"))
# Follow-on statistics per region and funding year (rounds after 2005-01-01).
#   companies: number of rows (funding rounds) in the group
#   follow_on: how many of those rounds have a non-NA `diff`, i.e. were
#              followed by a later round of the same company
# Written as explicit verb calls instead of dplyr's `%.%` chain operator,
# which was deprecated in dplyr 0.2 and later removed.
recent_rounds <- filter(rounds, funded_at > ymd("2005-01-01"))
rounds_by_region_year <- group_by(
  recent_rounds,
  company_region,
  year = year(funded_at)
)
regional_success <- arrange(
  summarise(
    rounds_by_region_year,
    companies = n(),
    follow_on = sum(!is.na(diff))
  ),
  desc(companies)
)

# Dodged bar chart of the follow-on rate for the four selected regions,
# restricted to years before 2014.
top_regions <- c("SF BAY", "NEW YORK", "BOSTON", "LOS ANGELES")
rplot <- ggplot(
  filter(regional_success, company_region %in% top_regions, year < 2014),
  aes(x = as.factor(year), y = follow_on / companies, fill = company_region)
) +
  geom_bar(stat = "identity", position = "dodge") +
  ylab("Follow On Rate") +
  xlab("Year") +
  ggtitle("Follow On Rate by Region") +
  theme(
    legend.position = c(.9, .8),
    legend.key = element_rect(fill = alpha("white", .2)),
    legend.background = element_rect(fill = alpha("white", .9))
  )

# 640 x 400 pixels at 72 dpi.
ggsave("region-follow-on.png", rplot, width = 640 / 72, height = 400 / 72, dpi = 72)
# sf: TRUE when the company's city is San Francisco. The extra spellings are
# misspelled variants matched verbatim against the cleaned city column.
rounds$sf <- rounds$company_city %in% c(
  "SAN FRANCISCO", "SAN FRANCISO", "SN FRANCISCO", "SAN FRANCSICO"
)

# Follow-on statistics inside the SF Bay region, split by sf flag and funding
# year (rounds after 2005-01-01). Same columns as the regional summary:
# `companies` counts rows, `follow_on` counts rows with a non-NA `diff`.
# Written as explicit verb calls instead of dplyr's `%.%` chain operator,
# which was deprecated in dplyr 0.2 and later removed.
bay_rounds <- filter(
  rounds,
  company_region == "SF BAY",
  funded_at > ymd("2005-01-01")
)
bay_rounds_by_year <- group_by(bay_rounds, sf, year = year(funded_at))
bay_area_success <- arrange(
  summarise(
    bay_rounds_by_year,
    companies = n(),
    follow_on = sum(!is.na(diff))
  ),
  year
)
bay_area_success$follow_on_rate <-
  bay_area_success$follow_on / bay_area_success$companies

# Dodged bar chart for years before 2014. The fill labels rely on the
# discrete scale ordering FALSE before TRUE: FALSE -> "Bay Area", TRUE -> "SF".
sfplot <- ggplot(
  filter(bay_area_success, year < 2014),
  aes(x = as.factor(year), y = follow_on_rate, fill = sf)
) +
  geom_bar(stat = "identity", position = "dodge") +
  ylab("Follow On Rate") +
  xlab("Year") +
  ggtitle("Follow On Rate in SF Bay Area") +
  scale_fill_discrete(labels = c("Bay Area", "SF")) +
  theme(
    legend.position = c(.9, .8),
    legend.key = element_rect(fill = alpha("white", .2)),
    legend.background = element_rect(fill = alpha("white", .9)),
    legend.title = element_blank()
  )

# 640 x 400 pixels at 72 dpi.
ggsave("sf-follow-on.png", sfplot, width = 640 / 72, height = 400 / 72, dpi = 72)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment