Last active
August 20, 2018 14:54
-
-
Save stephlocke/e5e41dae6a5ea6850a002128a206e09c to your computer and use it in GitHub Desktop.
Latest R budget scrape plus charts and figures
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
## ----message=FALSE------------------------------------------------------- | |
library(tabulizer) | |
library(tidyverse) | |
library(zoo) | |
library(tidytext) | |
## ------------------------------------------------------------------------ | |
"http://www.pass.org/Portals/0/Governance%202017/2018%20Budget.pdf?ver=2017-10-26-171625-747×tamp=1509063392150" %>% | |
tabulizer::extract_text() -> | |
rawtxt | |
str_trunc(rawtxt, 1000) | |
## ------------------------------------------------------------------------ | |
rawtxt %>% | |
tokenize(tokenizer = tokenizer_line()) %>% | |
head() | |
## ------------------------------------------------------------------------ | |
rawtxt %>% | |
tokenize(tokenizer = tokenizer_line()) %>% | |
str_replace_all("\\s+", " ") %>% | |
str_trim(side = "both") %>% | |
str_split(" ") %>% | |
head() | |
## ------------------------------------------------------------------------ | |
combineLHS <- function(x) { | |
n <- length(x) | |
if (str_detect(x[n],"\\$$")&n>=3) { | |
data_frame(lineItem=paste(x[1:(n - 2)], collapse = " "), | |
b2017=x[n - 1], | |
b2018=x[n]) | |
} else { | |
data_frame(lineItem=paste(x, collapse = " "), | |
b2017="", | |
b2018="") | |
} | |
} | |
## ------------------------------------------------------------------------ | |
rawtxt %>% | |
tokenize(tokenizer = tokenizer_line()) %>% | |
str_replace_all("\\s+", " ") %>% | |
str_trim(side = "both") %>% | |
str_split(" ") %>% | |
map_df(combineLHS) -> | |
rawdata | |
rawdata %>% | |
head() | |
## ------------------------------------------------------------------------ | |
moneycleaner <- function(x) { | |
x %>% | |
str_replace_all("[$,]|[[:space:]]", "") %>% | |
# Handle $0 amounts | |
str_replace_all("^-$", "0") %>% | |
# Handle negative amounts | |
ifelse(str_detect(., "\\("), | |
paste0("-", str_replace_all(., "[()]", "")), | |
.) %>% | |
as.numeric() | |
} | |
## ------------------------------------------------------------------------ | |
rawdata %>% | |
mutate( | |
## Count the switches between table types | |
TblChange = cumsum(lineItem %in% | |
c("REVENUE", "EXPENSE")), | |
## Check if row is a total line for a table | |
Total = str_detect(lineItem, "^TOTAL"), | |
## Check if row is a total based on two tables | |
Net = str_detect(lineItem, "^NET"), | |
## Work out if the row contains line items or titles | |
Title = b2017 == "" & b2018 == "" , | |
## Use MOD2 to determine which table type a row belonged to | |
Type = ifelse(TblChange %% 2 == 0, "Expense", "Revenue"), | |
## Identify the department if the row has budget 2018 | |
Dept = ifelse(str_detect(lineItem,"Budget 2018"), | |
str_replace(str_extract(lineItem, "^.+-")," -",""), | |
NA), | |
## We're only interested in actual line items | |
IgnoreRow=Total|Net|Title | |
) %>% | |
mutate( | |
## Use `zoo`s rolling functions to carry the last non-NA value forward. Essentially doign a fill-down between values | |
Dept = zoo::na.locf(Dept, na.rm = FALSE), | |
b2017=moneycleaner(b2017), | |
b2018=moneycleaner(b2018)) %>% | |
mutate( | |
## Invert the sign of expenses | |
b2017=ifelse(Type=="Expense",-1,1)*b2017, | |
b2018=ifelse(Type=="Expense",-1,1)*b2018 | |
) -> | |
alldata | |
## ----echo=FALSE---------------------------------------------------------- | |
alldata %>% head() | |
## ------------------------------------------------------------------------ | |
alldata %>% | |
filter(!IgnoreRow)-> | |
flaggeddata | |
## ------------------------------------------------------------------------ | |
flaggeddata %>% | |
filter(TblChange <= 2) %>% | |
select( | |
## Remove extraneous rows | |
-(TblChange:Title ), -IgnoreRow | |
)-> | |
summarydata | |
flaggeddata %>% | |
filter(TblChange > 2) %>% | |
select( | |
## Remove extraneous rows | |
-(TblChange:Title ), -IgnoreRow | |
) -> | |
detaildata | |
## ----echo=FALSE---------------------------------------------------------- | |
detaildata %>% head() | |
detaildata %>% | |
filter(Type == "Revenue") %>% | |
group_by(Dept) %>% | |
summarise(b2018=sum(b2018)) %>% | |
arrange(b2018) %>% | |
mutate(Dept=fct_inorder(Dept)) %>% | |
ggplot() + | |
aes(x=Dept, y=b2018, label=scales::dollar(b2018)) + | |
geom_col() + | |
coord_flip() + | |
scale_y_continuous(labels=scales::dollar) + | |
geom_text(aes(y= max(b2018)*1.25) ) + | |
ggthemes::theme_fivethirtyeight() + | |
labs(title = "Revenue by department") | |
detaildata %>% | |
filter(Type == "Revenue") %>% | |
mutate(Summit = str_detect(Dept, "[Ss]ummit")) %>% | |
group_by(Summit) %>% | |
summarise(b2018=sum(b2018)) %>% | |
mutate(prop=b2018/sum(b2018)) | |
detaildata %>% | |
filter(Type == "Expense") %>% | |
group_by(Dept) %>% | |
summarise(b2018=sum(-b2018)) %>% | |
arrange(b2018) %>% | |
mutate(Dept=fct_inorder(Dept)) %>% | |
ggplot() + | |
aes(x=Dept, y=b2018, label=scales::dollar(b2018)) + | |
geom_col() + | |
coord_flip() + | |
scale_y_continuous(labels=scales::dollar) + | |
geom_text(aes(y= max(b2018)*1.25) ) + | |
ggthemes::theme_fivethirtyeight() + | |
labs(title = "Expenses by department", subtitle="Expressed as positive numbers for ease of visualising") | |
detaildata %>% | |
filter(Type == "Expense") %>% | |
mutate(Summit = str_detect(Dept, "[Ss]ummit")) %>% | |
group_by(Summit) %>% | |
summarise(b2018=sum(b2018)/50000) %>% | |
mutate(prop=b2018/sum(b2018), | |
sum(b2018)) | |
detaildata %>% | |
filter(Type == "Expense") %>% | |
mutate(Community = str_detect(Dept, "[Ss]ummit|Local|SQLSaturday|Virtual|Content|Volunteer")) %>% | |
group_by(Community) %>% | |
summarise(b2018=sum(b2018)) %>% | |
mutate(prop=b2018/sum(b2018), | |
sum(b2018)) | |
detaildata %>% | |
filter(Type == "Expense" & | |
!str_detect(Dept, | |
50)) %>% | |
mutate(Management = str_detect(lineItem, "^Management")) %>% | |
group_by(Management) %>% | |
summarise(b2018=sum(b2018)) %>% | |
mutate(prop=b2018/sum(b2018), | |
sum(b2018)) | |
detaildata %>% | |
filter(Type == "Expense" ) %>% | |
mutate(Summit = str_detect(Dept, "[sS]ummit"), | |
Community = str_detect(Dept, "Local|SQLSaturday|Virtual|Content|Volunteer"), | |
Management = str_detect(lineItem, "^Management")) %>% | |
mutate(Type=case_when(Summit ~ "Summit", | |
Community ~ "Community", | |
Management ~ "Management", | |
TRUE ~ "Other")) %>% | |
group_by(Type) %>% | |
summarise(b2018=sum(-b2018)/50000) %>% | |
arrange(b2018) %>% | |
mutate(Type=fct_inorder(Type)) %>% | |
ggplot() + | |
aes(x= Type, y= b2018, label=scales::dollar(b2018)) + | |
geom_col() + | |
coord_flip() + | |
scale_y_continuous(labels=scales::dollar) + | |
geom_text(aes(y= max(b2018)*1.25) ) + | |
ggthemes::theme_fivethirtyeight() + | |
labs(title = "Expenses by category", subtitle="Expressed as positive numbers for ease of visualising") | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment