Last active
October 7, 2016 16:11
-
-
Save briatte/df0f41f5ce97443e0a0a to your computer and use it in GitHub Desktop.
100-line scraper for plenary statements by Members of the European Parliament — see briatte/euspeech for the full project
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| library(XML) | |
| library(jsonlite) | |
| library(plyr) | |
# Output directory for the per-MEP CSV files. The script is meant to be
# re-run, so an already existing directory must not raise a warning.
dir.create("records", showWarnings = FALSE)

# `data` first holds the path of the cached MEP index and, after the
# read.csv() call at the end of this section, the index itself.
data <- "meps.csv"

if (!file.exists(data)) {

  html <- "http://www.europarl.europa.eu/meps/en/directory.html?filter=all&leg="
  html <- htmlParse(html, encoding = "UTF-8")

  # index page
  root <- function(x) {
    paste0("//div[@class='zone_info_mep']/div[@class='mep_details']/ul/li", x)
  }

  link <- xpathSApply(html, root("[@class='mep_name']/a/@href"))
  name <- xpathSApply(html, root("[@class='mep_name']"), xmlValue)

  # # MEPs currently in power (different page, ongoing legislature only)
  # natl = xpathSApply(html, root("[contains(@class, 'nationality')]/@class"))
  # natl = gsub("nationality ", "", natl)
  # group = xpathSApply(html, root("[contains(@class, 'group')]/@class"))
  # group = gsub("group ", "", group)
  # party = xpathSApply(html, root("[contains(@class, 'nationality')]/span"))
  # party = sapply(party, xmlValue)
  # party = gsub("\\\"", "", party)

  # # add memberships from individual MEP pages (ongoing legislature only)
  # member = sapply(link, function(x) {
  #   print(x)
  #   html = htmlParse(paste0("http://www.europarl.europa.eu/", x))
  #   root = "//ul[@class='events_collection']"
  #   html = sapply(xpathSApply(html, paste0(root, "/*/acronym | ", root, "/*/*/acronym")), xmlValue)
  #   return(paste0(html, collapse = ";"))
  # })

  # Refuse to cache an unusable index: once meps.csv exists it is never
  # downloaded again, so an empty or misaligned scrape would stick.
  if (length(link) == 0 || length(link) != length(name)) {
    stop("MEP index scrape returned ", length(link), " links and ",
         length(name), " names; not writing ", data, call. = FALSE)
  }

  write.csv(data.frame(link, name), data) # , natl, party, group, member

}

data <- read.csv(data, stringsAsFactors = FALSE)
#' Download the "CRE" document list of one MEP for one legislature.
#'
#' Requests the paginated `see_more.html?type=CRE` JSON endpoint, starting at
#' index 0 and following the `nextIndex` field of each response until it is
#' zero, negative or missing.
#'
#' @param id MEP identifier, inserted into the `/meps/en/<id>/` URL.
#' @param leg Legislature number, sent as the `leg` query parameter.
#' @param verbose If `TRUE`, print the MEP id, legislature and each page index
#'   with `cat()`.
#' @param max_tries Maximum number of consecutive failed attempts allowed for
#'   a single page before the function stops with an error.
#' @return A data frame with a `leg` column followed by the columns of the
#'   downloaded `documentList` pages, or an empty `data.frame()` when no page
#'   contained a document list.
get_cre <- function(id, leg = 7, verbose = TRUE, max_tries = 5) {

  if (verbose) {
    cat("\n", id, "legislature", leg)
  }

  pages <- list()
  idx <- 0
  failures <- 0

  while (idx > -1) {

    if (verbose) {
      cat(" ", idx, "...")
    }

    url <- paste0("http://www.europarl.europa.eu/meps/en/", id,
                  "/see_more.html?type=CRE&leg=", leg, "&index=", idx)
    x <- try(fromJSON(readLines(url, warn = FALSE), flatten = TRUE))

    if (inherits(x, "try-error")) {
      failures <- failures + 1
      warning("Scraper error: MEP ", id)
      # Retry the same page, but not forever: a persistent failure (dead
      # URL, network down) used to loop endlessly. Stopping, rather than
      # returning what was collected so far, keeps a caller from mistaking
      # a truncated download for a complete one.
      if (failures >= max_tries) {
        stop("Scraper error: MEP ", id, ": giving up on legislature ", leg,
             ", index ", idx, " after ", failures, " failed attempts",
             call. = FALSE)
      }
      Sys.sleep(failures) # brief, growing pause before the next attempt
      next
    }

    failures <- 0

    if (inherits(x$documentList, "data.frame")) {
      pages[[length(pages) + 1]] <- cbind(leg, x$documentList)
    }

    # A zero, negative or absent nextIndex marks the last page.
    nxt <- x$nextIndex
    idx <- if (length(nxt) == 1 && !is.na(nxt) && nxt > 0) nxt else -1

  }

  if (length(pages) == 0) {
    return(data.frame())
  }
  do.call(rbind, pages)

}
#' Collapse a vector into one semicolon-separated string.
#'
#' @param x A vector or list; may be `NULL` or of length zero.
#' @return A length-one character vector: the elements of `x` joined with
#'   ";", or `NA_character_` when `x` is empty, so the result is always of
#'   type character.
sanitize <- function(x) {
  if (length(x) < 1) {
    return(NA_character_)
  }
  paste0(x, collapse = ";")
}
# Scrape every MEP listed in `data` and write one CSV per MEP under
# records/. An MEP whose file already exists is skipped, so an interrupted
# run can simply be started again.
files <- gsub("/meps/en/(\\d+)(.*)", "\\1", data$link)

# Columns whose cells are collapsed to a single ";"-separated string before
# the record is written as CSV.
list_cols <- c("formatList", "committeeList", "voteExplanationList")

for (k in seq_along(files)) {

  i <- files[k]
  file <- paste0("records/", i, "_cre.csv")

  if (file.exists(file)) {
    next
  }

  # legislatures 1 to 7
  record <- lapply(1:7, function(y) get_cre(i, y))
  record <- rbind.fill(record)

  # Test rows, not columns: a zero-row result has nothing worth writing.
  if (!is.null(record) && nrow(record) > 0) {

    record <- data.frame(id = i, record)

    # Only touch the columns that are actually present.
    for (col in intersect(list_cols, names(record))) {
      record[[col]] <- vapply(
        record[[col]],
        function(v) as.character(sanitize(v)),
        character(1),
        USE.NAMES = FALSE
      )
    }

    record <- lapply(record, unlist)
    write.csv(record, file)

  }

  # Count by position so that a repeated id cannot yield several numbers.
  message(paste("Scraped: MEP", i, length(files) - k, "left"))

}
| # kthxbye |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Scrape every MEP listed in `data` and write one CSV per MEP under
# records/. An MEP whose file already exists is skipped, so an interrupted
# run can simply be started again.
files <- gsub("/meps/en/(\\d+)(.*)", "\\1", data$link)

# Columns whose cells are collapsed to a single ";"-separated string before
# the record is written as CSV.
list_cols <- c("formatList", "committeeList", "voteExplanationList")

for (k in seq_along(files)) {

  i <- files[k]
  file <- paste0("records/", i, "_cre.csv")

  if (file.exists(file)) {
    next
  }

  # legislatures 1 to 7
  record <- lapply(1:7, function(y) get_cre(i, y))
  record <- rbind.fill(record)

  # Test rows, not columns: a zero-row result has nothing worth writing.
  if (!is.null(record) && nrow(record) > 0) {

    record <- data.frame(id = i, record)

    # Only touch the columns that are actually present.
    for (col in intersect(list_cols, names(record))) {
      record[[col]] <- vapply(
        record[[col]],
        function(v) as.character(sanitize(v)),
        character(1),
        USE.NAMES = FALSE
      )
    }

    record <- lapply(record, unlist)
    write.csv(record, file)

  }

  # Count by position so that a repeated id cannot yield several numbers.
  message(paste("Scraped: MEP", i, length(files) - k, "left"))

}
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#' Download the "CRE" document list of one MEP for one legislature.
#'
#' Requests the paginated `see_more.html?type=CRE` JSON endpoint, starting at
#' index 0 and following the `nextIndex` field of each response until it is
#' zero, negative or missing.
#'
#' @param id MEP identifier, inserted into the `/meps/en/<id>/` URL.
#' @param leg Legislature number, sent as the `leg` query parameter.
#' @param verbose Accepted for interface compatibility; this variant of the
#'   function prints no progress output and does not read the argument.
#' @param max_tries Maximum number of consecutive failed attempts allowed for
#'   a single page before the function stops with an error.
#' @return A data frame with a `leg` column followed by the columns of the
#'   downloaded `documentList` pages, or an empty `data.frame()` when no page
#'   contained a document list.
get_cre <- function(id, leg = 7, verbose = TRUE, max_tries = 5) {

  pages <- list()
  idx <- 0
  failures <- 0

  while (idx > -1) {

    url <- paste0("http://www.europarl.europa.eu/meps/en/", id,
                  "/see_more.html?type=CRE&leg=", leg, "&index=", idx)
    x <- try(fromJSON(readLines(url, warn = FALSE), flatten = TRUE))

    if (inherits(x, "try-error")) {
      failures <- failures + 1
      warning("Scraper error: MEP ", id)
      # Retry the same page, but not forever: a persistent failure (dead
      # URL, network down) used to loop endlessly. Stopping, rather than
      # returning what was collected so far, keeps a caller from mistaking
      # a truncated download for a complete one.
      if (failures >= max_tries) {
        stop("Scraper error: MEP ", id, ": giving up on legislature ", leg,
             ", index ", idx, " after ", failures, " failed attempts",
             call. = FALSE)
      }
      Sys.sleep(failures) # brief, growing pause before the next attempt
      next
    }

    failures <- 0

    if (inherits(x$documentList, "data.frame")) {
      pages[[length(pages) + 1]] <- cbind(leg, x$documentList)
    }

    # A zero, negative or absent nextIndex marks the last page.
    nxt <- x$nextIndex
    idx <- if (length(nxt) == 1 && !is.na(nxt) && nxt > 0) nxt else -1

  }

  if (length(pages) == 0) {
    return(data.frame())
  }
  do.call(rbind, pages)

}
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# `data` first holds the path of the cached MEP index and, after the
# read.csv() call at the end of this section, the index itself.
data <- "meps.csv"

if (!file.exists(data)) {

  html <- "http://www.europarl.europa.eu/meps/en/directory.html?filter=all&leg="
  html <- htmlParse(html, encoding = "UTF-8")

  # Builds an XPath to an <li> of the MEP details list, with `x` appended.
  root <- function(x) {
    paste0("//div[@class='zone_info_mep']/div[@class='mep_details']/ul/li", x)
  }

  link <- xpathSApply(html, root("[@class='mep_name']/a/@href"))
  name <- xpathSApply(html, root("[@class='mep_name']"), xmlValue)

  # Refuse to cache an unusable index: once meps.csv exists it is never
  # downloaded again, so an empty or misaligned scrape would stick.
  if (length(link) == 0 || length(link) != length(name)) {
    stop("MEP index scrape returned ", length(link), " links and ",
         length(name), " names; not writing ", data, call. = FALSE)
  }

  write.csv(data.frame(link, name), data)

}

data <- read.csv(data, stringsAsFactors = FALSE)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#' Collapse a vector into one semicolon-separated string.
#'
#' @param x A vector or list; may be `NULL` or of length zero.
#' @return A length-one character vector: the elements of `x` joined with
#'   ";", or `NA_character_` when `x` is empty, so the result is always of
#'   type character.
sanitize <- function(x) {
  if (length(x) < 1) {
    return(NA_character_)
  }
  paste0(x, collapse = ";")
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment