Last active
December 19, 2016 08:03
-
-
Save benjaminrobinson/22eb9ad99ef4b9a952fe1060c1f49393 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#LINK TO DATA SHEETS HERE: http://dchr.dc.gov/public-employee-salary-information | |
#USE TABULIZER | |
options(stringsAsFactors = FALSE) | |
options(scipen = 10) | |
library(devtools) | |
install_github("ropenscilabs/tabulizerjars") | |
install_github("ropenscilabs/tabulizer") | |
library(tabulizer) | |
library(rvest) | |
library(dplyr) | |
#AUTOMATE SELECTION OF PDF LINKS? USING RVEST | |
urls <- read_html("http://dchr.dc.gov/public-employee-salary-information") %>% | |
html_nodes("a") %>% | |
html_attr("href") | |
#REMOVE NON-PDF LINKS | |
urls <- urls[grep("pdf",urls)] | |
#MOVE MOST RECENT PDF TO FIRST IN THE COUNT | |
urls <- c(urls[20],urls[1:19]) | |
#CREATE FUNCTION TO PROVIDE QUARTERLY DATES FOR DATA SUBMISSIONS | |
disclose_date <- function(year,quarter){ | |
x <- as.Date( | |
paste0(year, | |
ifelse(quarter==1,"-03-31", | |
ifelse(quarter==2,"-06-30", | |
ifelse(quarter==3,"-09-30","-12-31"))))) | |
return(x) | |
} | |
#GET DATES FROM 2011 TO 2016 | |
dates <- lapply(2016:2011,disclose_date,4:1) | |
#SUBSET OUT ALL BUT 4TH QUARTER FOR 2011 AND KEEP ALL BUT 4TH QTR FOR 2016 FOR NOW | |
dates[[6]] <- dates[[6]][[1]] | |
dates[[1]] <- subset(dates[[1]],!(dates[[1]] %in% dates[[1]][[1]])) | |
dates <- unlist(dates) | |
#CREATE META DATA OF LINKS AND URLS | |
meta <- cbind(dates,urls) %>% | |
as.data.frame %>% | |
mutate(dates=as.Date(as.numeric(dates),"1970-01-01"),dates=as.character(dates)) | |
rm(urls,dates) | |
#TEST TO SEE IF THE FIRST PAGE OF PDF SCRAPING FAILS... | |
test_page <- function(x) { | |
if(nrow(extract_tables(meta$urls[x],pages=1,method="data.frame")[[1]])==0){ | |
return("Failed") | |
stop() | |
} else { | |
print("Passed") | |
} | |
} | |
#RUNNING THE DATA TAKES A DECENT AMOUNT OF TIME | |
#PARSE PDF! TIME IT TOO! | |
#x= | |
parse_pdf <- function(x) { | |
begin <- Sys.time() | |
if(test_page(x)=="Passed"){ | |
dchr <- extract_tables(meta$urls[x],pages=1:get_n_pages(meta$urls[x]),method="data.frame") | |
} else { | |
dchr <- extract_tables(meta$urls[x],pages=2:get_n_pages(meta$urls[x]),method="data.frame") | |
} | |
print(Sys.time()-begin) | |
return(cbind(dchr,Reporting.Date=meta$dates[x])) | |
} | |
##ONLY FIRST SHEET HAS VARIABLE NAMES. OTHER SHEETS READ THE FIRST ROW AS VARIABLE NAMES WHEN THEY ARE NOT. | |
for(a in 2:length(dchr)){ | |
dchr[[a]] <- rbind(dchr[[a]],names(dchr[[a]])) | |
names(dchr[[a]]) <- names(dchr[[1]]) | |
} | |
salary <- do.call(rbind,dchr) %>% | |
mutate(Export.Date=meta$dates[x], | |
Hire.Date=as.Date(Hire.Date,"%m/%d/%Y"), | |
Annual.Rate=sub("([.-])|[[:punct:]]", "\\1",Annual.Rate), | |
Annual.Rate=as.numeric(Annual.Rate)) %>% | |
sapply(.,trimws) %>% | |
as.data.frame |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment