Last active
April 26, 2016 19:12
-
-
Save mavam/f1467d48d5cf8a460b57 to your computer and use it in GitHub Desktop.
California data breach analysis
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# California data breach analysis | |
# | |
# Author: Matthias Vallentin <[email protected]> | |
# Copyright (c) 2016 | |
# | |
# To reproduce, please contact me. | |
library(dplyr) | |
library(ggplot2) | |
library(lubridate) | |
library(rvest) | |
library(stringr) | |
library(tidyr) | |
# Helper function to extract a column from a HTML table. | |
column <- function(data, xpath) { | |
data %>% html_node(xpath = xpath) %>% html_text(trim = TRUE) | |
} | |
breach_html <- read_html("https://oag.ca.gov/ecrime/databreach/list") | |
breach_table <- breach_html %>% html_nodes("tbody tr") | |
# Institution names. | |
institution = breach_table %>% column("td[1]/a") | |
# Comma-separated list of breach dates. | |
breached <- breach_table %>% | |
column("td[2]") %>% | |
str_split(", ") | |
# Date when breach was reported. | |
reported = breach_table %>% column("td[3]") %>% mdy | |
# Repetition factor to account for multiple breaches per report. | |
inflate <- sapply(breached, length) | |
breaches <- data.frame( | |
institution = rep(institution, inflate), | |
breached = breached %>% unlist %>% mdy(quiet = TRUE), | |
reported = rep(reported, inflate) | |
) %>% tbl_df | |
# How many unknown breach dates? | |
unknown <- breaches %>% | |
transmute(na = is.na(breached)) %>% | |
filter(na == TRUE) %>% | |
summarize(n()) | |
message("Total reports: ", length(reported)) | |
message("Total breaches: ", nrow(breaches)) | |
message("Unkown breach dates: ", unknown, " (", | |
round(unknown / nrow(breaches) * 100), "%)") | |
# Cumulative breaches, by date reported. | |
plot_cum_rep <- breaches %>% | |
arrange(reported) %>% | |
mutate(cum = seq_along(reported)) %>% | |
ggplot(aes(x = reported, y = cum)) + | |
geom_line() + | |
xlab("Year") + ylab("Breaches") | |
breaches_by_year <- breaches %>% | |
transmute(Year = year(breached)) %>% | |
group_by(Year) %>% | |
summarize(Breaches = n()) | |
reports_by_year <- breaches %>% | |
transmute(Year = year(reported)) %>% | |
group_by(Year) %>% | |
summarize(Reports = n()) | |
# Number of breaches and reports by year (without unknown breaches above). | |
plot_by_year <- left_join(breaches_by_year, reports_by_year) %>% | |
gather(key, value, Breaches, Reports) %>% | |
ggplot(aes(x = Year, y = value, fill = key)) + | |
geom_bar(stat = "identity", position = "dodge") + | |
scale_fill_discrete(name = "") + | |
scale_x_continuous(breaks = 2007:2016, labels = str_pad(7:16, 2, pad = 0)) + | |
xlab("Year") + ylab("Count") | |
# Weekday of breaches/reports. | |
plot_wday <- breaches %>% | |
transmute(Breaches = wday(breached), Reports = wday(reported)) %>% | |
gather(key, value, Breaches, Reports) %>% | |
ggplot(aes(x = value, fill = key)) + | |
geom_bar(position = "dodge") + | |
scale_fill_discrete(name = "") + | |
scale_x_continuous(breaks = 1:7, | |
labels = c("Sun", "Mon", "Tue", "Wed", "Thu", "Fri", "Sat")) + | |
xlab("Weekday") + ylab("Count") | |
# Top-10 most-breached institutions. | |
plot_top10_breached <- breaches %>% | |
group_by(institution) %>% | |
summarize(n = n()) %>% | |
arrange(desc(n)) %>% | |
head(10) %>% | |
ggplot(aes(x = reorder(institution, -n), y = n)) + | |
geom_bar(stat = "identity") + | |
xlab("Institution") + ylab("Breaches") + | |
scale_x_discrete(labels = function(x) str_wrap(x, width = 10)) | |
# Top-10 most-breached institutions reports). | |
plot_top10_reported <- breaches %>% | |
# Exclude AXP/Discover because they also have to report merchant data loss | |
filter(!grepl("AXP|Discover", institution)) %>% | |
group_by(institution) %>% | |
summarize(n = n_distinct(reported)) %>% | |
arrange(desc(n)) %>% | |
head(10) %>% | |
ggplot(aes(x = reorder(institution, -n), y = n)) + | |
geom_bar(stat = "identity") + | |
xlab("Institution") + ylab("Reports") + | |
scale_x_discrete(labels = function(x) str_wrap(x, width = 10)) | |
responsiveness <- breaches %>% | |
mutate(days = (breached %--% reported) / ddays(1), year = year(reported)) | |
# Distribution of time from breach until report. | |
plot_resp_ecdf <- responsiveness %>% | |
ggplot(aes(x = days)) + | |
stat_ecdf() + | |
scale_x_log10(minor_breaks = NULL, | |
breaks = c(1, 7, 14, 30, 60, 90, 150, 365, 2 * 365, 3 * 365, 5 * 365), | |
labels = c("1d", "1w", "2w", "1m", "2m", "3m", "5m", "1y", "2y", "3y", "5y")) + | |
xlab("Responsiveness") + | |
ylab("ECDF") | |
# Same as above, but one ECDF per year. | |
plot_resp_ecdf_by_year <- responsiveness %>% | |
mutate(Year = factor(year)) %>% | |
ggplot(aes(x = days, group = Year, color = Year)) + | |
stat_ecdf() + | |
scale_x_log10(minor_breaks = NULL, | |
breaks = c(1, 7, 14, 30, 60, 90, 150, 365, 2 * 365, 3 * 365, 5 * 365), | |
labels = c("1d", "1w", "2w", "1m", "2m", "3m", "5m", "1y", "2y", "3y", "5y")) + | |
xlab("Responsiveness") + | |
ylab("ECDF") | |
# Did the industry get quicker at reporting? | |
plot_resp_bar_by_year <- responsiveness %>% | |
group_by(year) %>% | |
summarize(median = median(days, na.rm = TRUE)) %>% | |
ggplot(aes(x = year, y = median)) + | |
geom_bar(stat = "identity") + | |
xlab("Median responsiveness") + | |
ylab("Days") | |
# Top-10 best responsiveness. | |
plot_resp_best <- responsiveness %>% | |
group_by(institution) %>% | |
summarize(median = median(days, na.rm = TRUE)) %>% | |
arrange(median) %>% | |
head(10) %>% | |
ggplot(aes(x = reorder(institution, median), y = median)) + | |
geom_bar(stat = "identity") + | |
xlab("Institution") + ylab("Median Response Time (Days)") + | |
scale_x_discrete(labels = function(x) str_wrap(x, width = 10)) | |
# Top-10 worst responsiveness. | |
plot_resp_worst <- responsiveness %>% | |
mutate(years = days / 365) %>% | |
group_by(institution) %>% | |
summarize(median = median(years, na.rm = TRUE)) %>% | |
arrange(desc(median)) %>% | |
head(10) %>% | |
ggplot(aes(x = reorder(institution, -median), y = median)) + | |
geom_bar(stat = "identity") + | |
xlab("Institution") + ylab("Median Response Time (Years)") + | |
scale_x_discrete(labels = function(x) str_wrap(x, width = 10)) | |
ggsave("breaches-cumulative-reported.png", plot_cum_rep) | |
ggsave("breaches-by-year.png", plot_by_year) | |
ggsave("breaches-by-weekday.png", plot_wday) | |
ggsave("breaches-top10-most-breached.png", plot_top10_breached) | |
ggsave("breaches-top10-most-reported.png", plot_top10_reported) | |
ggsave("breaches-responsiveness-ecdf.png", plot_resp_ecdf) | |
ggsave("breaches-responsiveness-ecdf-by-year.png", plot_resp_ecdf_by_year) | |
ggsave("breaches-responsiveness-by-year-median.png", plot_resp_bar_by_year) | |
ggsave("breaches-responsiveness-best.png", plot_resp_best) | |
ggsave("breaches-responsiveness-worst.png", plot_resp_worst) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment