Skip to content

Instantly share code, notes, and snippets.

@stephlocke
Last active August 20, 2018 14:54
Show Gist options
  • Save stephlocke/e5e41dae6a5ea6850a002128a206e09c to your computer and use it in GitHub Desktop.
Save stephlocke/e5e41dae6a5ea6850a002128a206e09c to your computer and use it in GitHub Desktop.
Latest R budget scrape plus charts and figures
## ----message=FALSE-------------------------------------------------------
library(tabulizer)
library(tidyverse)
library(zoo)
library(tidytext)
## ------------------------------------------------------------------------
"http://www.pass.org/Portals/0/Governance%202017/2018%20Budget.pdf?ver=2017-10-26-171625-747&timestamp=1509063392150" %>%
tabulizer::extract_text() ->
rawtxt
str_trunc(rawtxt, 1000)
## ------------------------------------------------------------------------
rawtxt %>%
tokenize(tokenizer = tokenizer_line()) %>%
head()
## ------------------------------------------------------------------------
rawtxt %>%
tokenize(tokenizer = tokenizer_line()) %>%
str_replace_all("\\s+", " ") %>%
str_trim(side = "both") %>%
str_split(" ") %>%
head()
## ------------------------------------------------------------------------
combineLHS <- function(x) {
n <- length(x)
if (str_detect(x[n],"\\$$")&n>=3) {
data_frame(lineItem=paste(x[1:(n - 2)], collapse = " "),
b2017=x[n - 1],
b2018=x[n])
} else {
data_frame(lineItem=paste(x, collapse = " "),
b2017="",
b2018="")
}
}
## ------------------------------------------------------------------------
rawtxt %>%
tokenize(tokenizer = tokenizer_line()) %>%
str_replace_all("\\s+", " ") %>%
str_trim(side = "both") %>%
str_split(" ") %>%
map_df(combineLHS) ->
rawdata
rawdata %>%
head()
## ------------------------------------------------------------------------
moneycleaner <- function(x) {
x %>%
str_replace_all("[$,]|[[:space:]]", "") %>%
# Handle $0 amounts
str_replace_all("^-$", "0") %>%
# Handle negative amounts
ifelse(str_detect(., "\\("),
paste0("-", str_replace_all(., "[()]", "")),
.) %>%
as.numeric()
}
## ------------------------------------------------------------------------
rawdata %>%
mutate(
## Count the switches between table types
TblChange = cumsum(lineItem %in%
c("REVENUE", "EXPENSE")),
## Check if row is a total line for a table
Total = str_detect(lineItem, "^TOTAL"),
## Check if row is a total based on two tables
Net = str_detect(lineItem, "^NET"),
## Work out if the row contains line items or titles
Title = b2017 == "" & b2018 == "" ,
## Use MOD2 to determine which table type a row belonged to
Type = ifelse(TblChange %% 2 == 0, "Expense", "Revenue"),
## Identify the department if the row has budget 2018
Dept = ifelse(str_detect(lineItem,"Budget 2018"),
str_replace(str_extract(lineItem, "^.+-")," -",""),
NA),
## We're only interested in actual line items
IgnoreRow=Total|Net|Title
) %>%
mutate(
## Use `zoo`s rolling functions to carry the last non-NA value forward. Essentially doign a fill-down between values
Dept = zoo::na.locf(Dept, na.rm = FALSE),
b2017=moneycleaner(b2017),
b2018=moneycleaner(b2018)) %>%
mutate(
## Invert the sign of expenses
b2017=ifelse(Type=="Expense",-1,1)*b2017,
b2018=ifelse(Type=="Expense",-1,1)*b2018
) ->
alldata
## ----echo=FALSE----------------------------------------------------------
alldata %>% head()
## ------------------------------------------------------------------------
alldata %>%
filter(!IgnoreRow)->
flaggeddata
## ------------------------------------------------------------------------
flaggeddata %>%
filter(TblChange <= 2) %>%
select(
## Remove extraneous rows
-(TblChange:Title ), -IgnoreRow
)->
summarydata
flaggeddata %>%
filter(TblChange > 2) %>%
select(
## Remove extraneous rows
-(TblChange:Title ), -IgnoreRow
) ->
detaildata
## ----echo=FALSE----------------------------------------------------------
detaildata %>% head()
detaildata %>%
filter(Type == "Revenue") %>%
group_by(Dept) %>%
summarise(b2018=sum(b2018)) %>%
arrange(b2018) %>%
mutate(Dept=fct_inorder(Dept)) %>%
ggplot() +
aes(x=Dept, y=b2018, label=scales::dollar(b2018)) +
geom_col() +
coord_flip() +
scale_y_continuous(labels=scales::dollar) +
geom_text(aes(y= max(b2018)*1.25) ) +
ggthemes::theme_fivethirtyeight() +
labs(title = "Revenue by department")
detaildata %>%
filter(Type == "Revenue") %>%
mutate(Summit = str_detect(Dept, "[Ss]ummit")) %>%
group_by(Summit) %>%
summarise(b2018=sum(b2018)) %>%
mutate(prop=b2018/sum(b2018))
detaildata %>%
filter(Type == "Expense") %>%
group_by(Dept) %>%
summarise(b2018=sum(-b2018)) %>%
arrange(b2018) %>%
mutate(Dept=fct_inorder(Dept)) %>%
ggplot() +
aes(x=Dept, y=b2018, label=scales::dollar(b2018)) +
geom_col() +
coord_flip() +
scale_y_continuous(labels=scales::dollar) +
geom_text(aes(y= max(b2018)*1.25) ) +
ggthemes::theme_fivethirtyeight() +
labs(title = "Expenses by department", subtitle="Expressed as positive numbers for ease of visualising")
detaildata %>%
filter(Type == "Expense") %>%
mutate(Summit = str_detect(Dept, "[Ss]ummit")) %>%
group_by(Summit) %>%
summarise(b2018=sum(b2018)/50000) %>%
mutate(prop=b2018/sum(b2018),
sum(b2018))
detaildata %>%
filter(Type == "Expense") %>%
mutate(Community = str_detect(Dept, "[Ss]ummit|Local|SQLSaturday|Virtual|Content|Volunteer")) %>%
group_by(Community) %>%
summarise(b2018=sum(b2018)) %>%
mutate(prop=b2018/sum(b2018),
sum(b2018))
detaildata %>%
filter(Type == "Expense" &
!str_detect(Dept,
50)) %>%
mutate(Management = str_detect(lineItem, "^Management")) %>%
group_by(Management) %>%
summarise(b2018=sum(b2018)) %>%
mutate(prop=b2018/sum(b2018),
sum(b2018))
detaildata %>%
filter(Type == "Expense" ) %>%
mutate(Summit = str_detect(Dept, "[sS]ummit"),
Community = str_detect(Dept, "Local|SQLSaturday|Virtual|Content|Volunteer"),
Management = str_detect(lineItem, "^Management")) %>%
mutate(Type=case_when(Summit ~ "Summit",
Community ~ "Community",
Management ~ "Management",
TRUE ~ "Other")) %>%
group_by(Type) %>%
summarise(b2018=sum(-b2018)/50000) %>%
arrange(b2018) %>%
mutate(Type=fct_inorder(Type)) %>%
ggplot() +
aes(x= Type, y= b2018, label=scales::dollar(b2018)) +
geom_col() +
coord_flip() +
scale_y_continuous(labels=scales::dollar) +
geom_text(aes(y= max(b2018)*1.25) ) +
ggthemes::theme_fivethirtyeight() +
labs(title = "Expenses by category", subtitle="Expressed as positive numbers for ease of visualising")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment