Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save alekrutkowski/dc8980653a0e1f1b5243ea5f2757719a to your computer and use it in GitHub Desktop.
Save alekrutkowski/dc8980653a0e1f1b5243ea5f2757719a to your computer and use it in GitHub Desktop.
Web scraping Polish Parliament (Sejm), 8th and 9th term, years 2018–2019 (R script)
# just a stub to change the title of the gist
## 8th term of the Polish Parliament (Sejm)
library(rvest)
library(data.table)
# Bind fastpipe's `%>>%` operator to the `%>%` name used by the rest of the script.
`%>%` <- fastpipe::`%>>%`

# Read the 8th-term workbook (reading starts at sheet row 2) as a data.table.
tab <- as.data.table(
  openxlsx::read.xlsx('Sejm VIII kadencji - uchwaly.xlsx', startRow = 2)
)

# `Data` is numeric here; turn it into Date values counted from 1899-12-30
# (presumably Excel serial dates -- confirm against the workbook).
tab[, Data := as.Date(Data, origin = "1899-12-30")]

# `Numb` = first word of the `Nr.druku` column; it is used as the `nr` query
# parameter of the page URLs built below.
tab[, Numb := stringi::stri_extract_first_words(Nr.druku)]

# One page URL per table row.
urls <- paste0(
  'http://www.sejm.gov.pl/Sejm8.nsf/PrzebiegProc.xsp?nr=',
  tab$Numb
)
# Download one page and pull the bill title and introduction-date text from it.
#
# Arguments:
#   table_row_num  value used to name the cached result,
#                  'raw_data/<table_row_num>.Rds'
#   url            address of the page to download
#
# Returns a data.table with two columns, `BillTitle` and `IntroDate`, holding
# the text of the first node matching each CSS selector below.
# Side effects: emits a progress message and writes the result to 'raw_data/'.
get_BillTitle_IntroDate <- function(table_row_num, url) {
  message('Scraping url: ', url)
  # rvest::html() was deprecated in rvest 0.3.0 and later removed;
  # read_html() is its replacement.
  page <- read_html(url)
  d. <- data.table(
    BillTitle = html_text(html_node(page, '.proces-legislacyjny .h2')),
    IntroDate = html_text(
      html_node(page, '.proces-legislacyjny ul.proces li span')
    )
  )
  # saveRDS() fails if the target directory is missing, so make sure it exists.
  dir.create('raw_data', showWarnings = FALSE, recursive = TRUE)
  saveRDS(d., paste0('raw_data/', table_row_num, '.Rds'))
  d.
}
# Scrape every URL (pairing each with its `Lp.` value from `tab`), stack the
# per-page results into one data.table in row order, and keep a copy on disk.
dt_BillTitle_IntroDate <- rbindlist(
  Map(get_BillTitle_IntroDate, table_row_num = tab$Lp., url = urls)
)
saveRDS(dt_BillTitle_IntroDate, 'dt_BillTitle_IntroDate.Rds')
# Replace the first occurrence of each listed Polish month-name form in every
# element of `txt` with the matching three-letter English abbreviation.
# The patterns are regular expressions: the `.` in 'wrze.nia' and
# 'pa.dziernika' matches any single character (presumably standing in for the
# Polish diacritic letters -- confirm).
# Returns a character vector of the same length as `txt`.
`PLmonth->ENmonth` <- function(txt) {
  polish_patterns <- c(
    'stycznia', 'lutego', 'marca', 'kwietnia', 'maja', 'czerwca',
    'lipca', 'sierpnia', 'wrze.nia', 'pa.dziernika', 'listopada', 'grudnia'
  )
  english_abbrevs <- c(
    'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
    'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'
  )
  # Apply the substitutions one after another, in calendar order.
  for (i in seq_along(polish_patterns)) {
    txt <- sub(polish_patterns[i], english_abbrevs[i], txt)
  }
  txt
}
# Parse "<day> <Polish month name> <year>" text into Date values.
# `%b` in as.Date()/strptime() matches month abbreviations of the *current*
# LC_TIME locale, so the English abbreviations produced by `PLmonth->ENmonth`
# would mostly parse to NA under e.g. a Polish locale. LC_TIME is therefore
# switched to "C" for the duration of the parse and restored on exit.
parse_intro_date <- function(txt) {
  lc_time_before <- Sys.getlocale('LC_TIME')
  on.exit(Sys.setlocale('LC_TIME', lc_time_before), add = TRUE)
  Sys.setlocale('LC_TIME', 'C')
  as.Date(`PLmonth->ENmonth`(txt), format = '%d %b %Y')
}

# Put the scraped columns next to the spreadsheet rows (matched by position).
tab_with_BillTitle_IntroDate <- cbind(tab, dt_BillTitle_IntroDate)
tab_with_BillTitle_IntroDate[, IntroDate := parse_intro_date(IntroDate)]
# TRUE when the title starts with 'Rz', any one character, then 'dowy'.
tab_with_BillTitle_IntroDate[
  , ComesFromGovernment := grepl('^Rz.dowy.*', BillTitle)]

# Full table (all columns, original `Data` column name) for reference.
openxlsx::write.xlsx(
  tab_with_BillTitle_IntroDate, 'tab_with_BillTitle_IntroDate_raw.xlsx'
)

# Final table: rename the adoption-date column and keep only four columns.
setnames(tab_with_BillTitle_IntroDate, 'Data', 'AdoptionDate')
tab_with_BillTitle_IntroDate <- tab_with_BillTitle_IntroDate[
  , .(IntroDate, AdoptionDate, ComesFromGovernment, BillTitle)]
openxlsx::write.xlsx(
  tab_with_BillTitle_IntroDate, 'PL 8th parliament term, years 2018-2019.xlsx'
)
## 9th term of the Polish Parliament (Sejm)
library(rvest)
library(data.table)
# Bind fastpipe's `%>>%` operator to the `%>%` name used by the rest of the script.
`%>%` <- fastpipe::`%>>%`

# Read the 9th-term workbook (reading starts at sheet row 2) as a data.table.
tab <- as.data.table(
  openxlsx::read.xlsx('Sejm IX kadencji - uchwaly.xlsx', startRow = 2)
)

# `Data` is numeric here; turn it into Date values counted from 1899-12-30
# (presumably Excel serial dates -- confirm against the workbook).
tab[, Data := as.Date(Data, origin = "1899-12-30")]

# `Numb` = first word of the `Nr.druku` column; it is used as the `nr` query
# parameter of the page URLs built below.
tab[, Numb := stringi::stri_extract_first_words(Nr.druku)]

# One page URL per table row.
urls <- paste0(
  'http://www.sejm.gov.pl/Sejm9.nsf/PrzebiegProc.xsp?nr=',
  tab$Numb
)
# Download one page and pull the bill title and introduction-date text from it.
#
# Arguments:
#   table_row_num  value used to name the cached result,
#                  'raw_data_9/<table_row_num>.Rds'
#   url            address of the page to download
#
# Returns a data.table with two columns, `BillTitle` and `IntroDate`, holding
# the text of the first node matching each CSS selector below.
# Side effects: emits a progress message and writes the result to 'raw_data_9/'.
get_BillTitle_IntroDate <- function(table_row_num, url) {
  message('Scraping url: ', url)
  # rvest::html() was deprecated in rvest 0.3.0 and later removed;
  # read_html() is its replacement.
  page <- read_html(url)
  d. <- data.table(
    BillTitle = html_text(html_node(page, '.proces-legislacyjny .h2')),
    IntroDate = html_text(
      html_node(page, '.proces-legislacyjny ul.proces li span')
    )
  )
  # saveRDS() fails if the target directory is missing, so make sure it exists.
  dir.create('raw_data_9', showWarnings = FALSE, recursive = TRUE)
  saveRDS(d., paste0('raw_data_9/', table_row_num, '.Rds'))
  d.
}
# Scrape every URL (pairing each with its `Lp.` value from `tab`), stack the
# per-page results into one data.table in row order, and keep a copy on disk.
dt_BillTitle_IntroDate <- rbindlist(
  Map(get_BillTitle_IntroDate, table_row_num = tab$Lp., url = urls)
)
saveRDS(dt_BillTitle_IntroDate, 'dt_BillTitle_IntroDate_9.Rds')
# Replace the first occurrence of each listed Polish month-name form in every
# element of `txt` with the matching three-letter English abbreviation.
# The patterns are regular expressions: the `.` in 'wrze.nia' and
# 'pa.dziernika' matches any single character (presumably standing in for the
# Polish diacritic letters -- confirm).
# Returns a character vector of the same length as `txt`.
`PLmonth->ENmonth` <- function(txt) {
  polish_patterns <- c(
    'stycznia', 'lutego', 'marca', 'kwietnia', 'maja', 'czerwca',
    'lipca', 'sierpnia', 'wrze.nia', 'pa.dziernika', 'listopada', 'grudnia'
  )
  english_abbrevs <- c(
    'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
    'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'
  )
  # Apply the substitutions one after another, in calendar order.
  for (i in seq_along(polish_patterns)) {
    txt <- sub(polish_patterns[i], english_abbrevs[i], txt)
  }
  txt
}
# Parse "<day> <Polish month name> <year>" text into Date values.
# `%b` in as.Date()/strptime() matches month abbreviations of the *current*
# LC_TIME locale, so the English abbreviations produced by `PLmonth->ENmonth`
# would mostly parse to NA under e.g. a Polish locale. LC_TIME is therefore
# switched to "C" for the duration of the parse and restored on exit.
parse_intro_date <- function(txt) {
  lc_time_before <- Sys.getlocale('LC_TIME')
  on.exit(Sys.setlocale('LC_TIME', lc_time_before), add = TRUE)
  Sys.setlocale('LC_TIME', 'C')
  as.Date(`PLmonth->ENmonth`(txt), format = '%d %b %Y')
}

# Put the scraped columns next to the spreadsheet rows (matched by position).
tab_with_BillTitle_IntroDate <- cbind(tab, dt_BillTitle_IntroDate)
tab_with_BillTitle_IntroDate[, IntroDate := parse_intro_date(IntroDate)]
# TRUE when the title starts with 'Rz', any one character, then 'dowy'.
tab_with_BillTitle_IntroDate[
  , ComesFromGovernment := grepl('^Rz.dowy.*', BillTitle)]

# Full table (all columns, original `Data` column name) for reference.
openxlsx::write.xlsx(
  tab_with_BillTitle_IntroDate, 'tab_with_BillTitle_IntroDate_raw_9.xlsx'
)

# Final table: rename the adoption-date column and keep only four columns.
setnames(tab_with_BillTitle_IntroDate, 'Data', 'AdoptionDate')
tab_with_BillTitle_IntroDate <- tab_with_BillTitle_IntroDate[
  , .(IntroDate, AdoptionDate, ComesFromGovernment, BillTitle)]
openxlsx::write.xlsx(
  tab_with_BillTitle_IntroDate, 'PL 9th parliament term, year 2019.xlsx'
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment