Skip to content

Instantly share code, notes, and snippets.

@jdavidson
Created January 13, 2014 19:25
Show Gist options
  • Save jdavidson/8406379 to your computer and use it in GitHub Desktop.
Save jdavidson/8406379 to your computer and use it in GitHub Desktop.
An analysis of crunchbase data for start up financing timing.
library(ggplot2)
library(ggthemes)
library(plyr)
library(lubridate)
library(scales)
library(data.table)
options(scipen=999)
options(stringsAsFactors = FALSE)
rounds <- read.csv("2014-01-06-crunchbase_monthly_export_rounds.csv")
# exclude non venture rounds (other, private equity, post-ipo)
rounds <- subset(rounds, funding_round_type %in% c("venture", "angel", "series-a", "series-b"))# , "series-c+"))
rounds <- subset(rounds, funded_month != "1960-01")
# fix strange date data
rounds$funded_at <- ymd(paste(rounds$funded_month, "01", sep="-"))
rounds$round_raised_amount_usd <- cut(rounds$raised_amount_usd, breaks=c(0, 500000, 1000000, 5000000, 10000000, 20000000, 40000000, 80000000, Inf), right=FALSE)# round(rounds $raised_amount_usd / 500000) * 500000
# dedup
rounds <- data.table(rounds)
setkeyv(rounds, c("company_name", "funded_at", "funding_round_type"))
rounds <- unique(rounds)
# restrict to companies first funded after 2008
companies <- rounds[, list(first_funded_at = min(funded_at)), by = company_name]
rounds <- join(rounds, companies)
rounds <- subset(rounds, first_funded_at > ymd("2008-01-01"))
# fix strange difference in units from diff
my.diff <- function(x, lag=1) {
n <- length(x)
round(difftime(x[(1+lag):n], x[1:(n-lag)], units="days") / 30)
}
# round sequences
rounds_index <- rounds[, id := seq_along(funded_at), by=company_name]
rounds_index <- rounds_index[, diff := c(my.diff(funded_at), NA), by=company_name]
rounds_index$lifetime <- rounds_index$diff
rounds_index[is.na(rounds_index$lifetime),]$lifetime <- round(as.numeric(difftime(max(rounds_index$funded_at), rounds_index[is.na(rounds_index$lifetime),]$funded_at, units="days") / 30))
# rounds_index <- ddply(rounds, .(company_name), transform, index=seq_along(funded_at), diff=c(my.diff(funded_at), NA))
# aggregate
medians <- ddply(rounds_index, .(funding_round_type), summarize, rounds=length(id), median=median(diff, na.rm=T), mean=mean(diff, na.rm=T))
medians <- medians[order(medians$median),]
diff_summary <- rounds_index[, list(rounds = length(id)), by = c("funding_round_type", "diff")]
setnames(diff_summary, "diff", "lifetime")
diff_summary <- diff_summary[!is.na(diff_summary$lifetime),]
diff_summary <- diff_summary[order(funding_round_type, lifetime, decreasing=T),]
diff_summary <- diff_summary[, cum_rounds := cumsum(rounds), by= funding_round_type]
round_lifetimes <- rounds_index[, list(total_rounds = length(id)), by = c("funding_round_type", "lifetime")]
round_lifetimes <- round_lifetimes[order(funding_round_type, lifetime, decreasing=T),]
round_lifetimes <- round_lifetimes[, cum_total_rounds := cumsum(total_rounds), by= funding_round_type]
diff_summary <- join(diff_summary, round_lifetimes)
diff_summary <- diff_summary[order(funding_round_type, lifetime),]
diff_summary$percent <- diff_summary$rounds / diff_summary$cum_total_rounds
ggplot(diff_summary, aes(x=lifetime, y=rounds, color=funding_round_type)) + geom_point() + scale_x_continuous(breaks = 0:4 * 12, limits=c(0,48)) + geom_smooth() + ggtitle("Financings") + ylab("Financings") + xlab("Months After Funding") + scale_color_discrete(name = "Round")
ggplot(diff_summary, aes(x=lifetime, y= cum_rounds / cum_total_rounds, color=funding_round_type)) + geom_line() + scale_x_continuous(breaks = 0:4 * 12, limits=c(0,48)) + ggtitle("Likelihood Of Raising A Follow On Round By Time") + ylab("Percent of Companies that Raise a Follow On Round") + xlab("Months After Funding") + scale_y_continuous(labels = percent_format()) + scale_color_discrete(name = "Round")
ggsave("follow-on-likelihood-by-time.png")
#### round_raised_amount_usd
medians <- ddply(rounds_index, .(funding_round_type, round_raised_amount_usd), summarize, rounds=length(id), median=median(diff, na.rm=T), mean=mean(diff, na.rm=T))
medians[order(medians$funding_round_type, medians$median),]
diff_summary <- rounds_index[, list(rounds = length(id)), by = c("funding_round_type", "round_raised_amount_usd", "diff")]
setnames(diff_summary, "diff", "lifetime")
diff_summary <- diff_summary[!is.na(diff_summary$lifetime),]
diff_summary <- diff_summary[order(funding_round_type, round_raised_amount_usd, lifetime, decreasing=T),]
diff_summary <- diff_summary[, cum_rounds := cumsum(rounds), by= c("funding_round_type", "round_raised_amount_usd")]
round_lifetimes <- rounds_index[, list(total_rounds = length(id)), by = c("funding_round_type", "round_raised_amount_usd", "lifetime")]
round_lifetimes <- round_lifetimes[order(funding_round_type, round_raised_amount_usd, lifetime, decreasing=T),]
round_lifetimes <- round_lifetimes[, cum_total_rounds := cumsum(total_rounds), by=c("funding_round_type", "round_raised_amount_usd")]
diff_summary <- join(diff_summary, round_lifetimes)
diff_summary <- diff_summary[order(funding_round_type, round_raised_amount_usd, lifetime),]
diff_summary$percent <- diff_summary$rounds / diff_summary$cum_total_rounds
diff_summary <- diff_summary[!is.na(diff_summary$round_raised_amount_usd),]
ggplot(subset(diff_summary, funding_round_type == "angel"), aes(x=lifetime, y=rounds, color= as.factor(round_raised_amount_usd))) + geom_point() + xlim(0,48) + geom_smooth() + ggtitle("") + ylab("Rounds") + xlab("Months After Funding")
ggplot(subset(diff_summary, funding_round_type == "angel"), aes(x=lifetime, y= cum_rounds / cum_total_rounds, color= as.factor(round_raised_amount_usd))) + geom_line() + xlim(0,48) + ggtitle("") + ylab("Likelihood of a Follow On Round") + xlab("Months After Funding") + scale_y_continuous(labels = percent_format())
diff_summary$funding_round_type <- factor(diff_summary$funding_round_type, levels=c("angel", "venture", "series-a", "series-b"))
diff_summary$round_raised_amount_usd <- mapvalues(diff_summary$round_raised_amount_usd, from = c("[0,5e+05)", "[5e+05,1e+06)", "[1e+06,5e+06)", "[5e+06,1e+07)", "[1e+07,2e+07)", "[2e+07,4e+07)", "[4e+07,8e+07)", "[8e+07,Inf)"), to = c("$0-.5M", "$.5-1M", "$1-5M", "$5-10M", "$10-20M", "$20-40M", "$40-80M", "$80M+"))
ggplot(subset(diff_summary, cum_total_rounds > 20 & round_raised_amount_usd != "$80M+"), aes(x=lifetime, y= cum_rounds / cum_total_rounds, color=round_raised_amount_usd)) + geom_line() + xlim(0,48) + ggtitle("Follow On Likelihood by Round and Amount") + ylab("Likelihood of a Follow On Round") + xlab("Months After Funding") + scale_y_continuous(labels = percent_format()) + facet_wrap(~ funding_round_type) + scale_colour_few()
ggsave("follow-on-likelihood-by-round-size.png")
#### company_category_code
medians <- ddply(rounds_index, .(funding_round_type, company_category_code), summarize, rounds=length(id), median=median(diff, na.rm=T), mean=mean(diff, na.rm=T))
medians <- medians[order(medians$funding_round_type, medians$median),]
diff_summary <- rounds_index[, list(rounds = length(id)), by = c("funding_round_type", "company_category_code", "diff")]
setnames(diff_summary, "diff", "lifetime")
diff_summary <- diff_summary[!is.na(diff_summary$lifetime),]
diff_summary <- diff_summary[order(funding_round_type, company_category_code, lifetime, decreasing=T),]
diff_summary <- diff_summary[, cum_rounds := cumsum(rounds), by= c("funding_round_type", "company_category_code")]
round_lifetimes <- rounds_index[, list(total_rounds = length(id)), by = c("funding_round_type", "company_category_code", "lifetime")]
round_lifetimes <- round_lifetimes[order(funding_round_type, company_category_code, lifetime, decreasing=T),]
round_lifetimes <- round_lifetimes[, cum_total_rounds := cumsum(total_rounds), by=c("funding_round_type", "company_category_code")]
diff_summary <- join(diff_summary, round_lifetimes)
diff_summary <- diff_summary[order(funding_round_type, company_category_code, lifetime),]
diff_summary$percent <- diff_summary$rounds / diff_summary$cum_total_rounds
category_counts <- ddply(rounds, .(company_category_code), summarize, counts=length(unique(company_name)))
category_counts <- category_counts[order(category_counts$counts, decreasing=T),]
ggplot(subset(diff_summary, company_category_code %in% category_counts[1:9, "company_category_code"]), aes(x=lifetime, y=rounds, color=funding_round_type)) + geom_point() + xlim(0,48) + geom_smooth() + ggtitle("") + ylab("Rounds") + xlab("Months After Funding") + facet_wrap(~ company_category_code)
ggplot(subset(diff_summary, company_category_code %in% category_counts[1:9, "company_category_code"]), aes(x=lifetime, y= cum_rounds / cum_total_rounds, color= funding_round_type)) + geom_line() + xlim(0,48) + ggtitle("Follow On Likelihood by Category") + ylab("Likelihood of a Follow On Round") + xlab("Months After Funding") + scale_y_continuous(labels = percent_format()) + facet_wrap(~ company_category_code)
ggsave("follow-on-likelihood-by-round-category.png")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment