Created
July 24, 2014 12:29
-
-
Save briatte/507140640814cb0b7e31 to your computer and use it in GitHub Desktop.
download all asset declarations from French MPs, July 2014
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Download every asset declaration (PDF) published for French MPs on
# hatvp.fr into ./declarations, then print a summary and write a manifest.
# Re-running the script skips MPs whose declarations are already on disk.

# parse XPath syntax from well-formed HTML
library(XML)

base_url <- "http://www.hatvp.fr"
out_dir <- "declarations"

# complete archive will take ~ 1.4 GB on disk
dir.create(out_dir, showWarnings = FALSE)

# TRUE when `folder` already holds <mp>.pdf or <mp>-<n>.pdf.
# Whole file stems are compared (rather than using the MP name as a regular
# expression) so that a name contained in another MP's name is not skipped.
has_declaration <- function(mp, folder = out_dir) {
  stems <- sub("(-\\d+)?\\.pdf$", "", dir(folder, "\\.pdf$"))
  mp %in% stems
}

# Download one PDF to `dest`; returns TRUE on success, FALSE otherwise.
# mode = "wb" keeps the binary file intact on Windows. A failure is reported
# as a warning and any partial file is removed, so the crawl carries on and a
# later run retries that MP instead of treating it as already downloaded.
fetch_pdf <- function(href, dest) {
  url <- gsub("\\.\\.", base_url, href)
  ok <- tryCatch(
    isTRUE(download.file(url, dest, quiet = TRUE, mode = "wb") == 0),
    error = function(e) {
      warning("download failed for ", url, ": ", conditionMessage(e),
              call. = FALSE)
      FALSE
    }
  )
  if (!ok && file.exists(dest)) {
    unlink(dest)
  }
  ok
}

# Extract the values matched by an XPath attribute query as a plain character
# vector; an empty result (NULL or empty list) becomes character(0).
attr_values <- function(doc, xpath) {
  as.character(unlist(xpathSApply(doc, xpath)))
}

# finds 941 MPs on 2014-07-24 at website launch
index <- htmlParse(
  paste0(base_url, "/consulter-les-declarations-rechercher.html")
)
page_hrefs <- attr_values(index, "//div[@id='annuaire']/*/*/*/a/@href")
pages <- if (length(page_hrefs) > 0) {
  paste0(base_url, "/", page_hrefs)
} else {
  character(0)
}

for (n in seq_along(pages)) {
  page <- pages[n]

  # countdown of remaining pages, followed by the page URL
  cat(sprintf("%0.3g", length(pages) - n), page)

  # MP identifier is the page file name without its extension
  mp <- sub("\\.html$", "", basename(page))
  pdf_path <- file.path(out_dir, paste0(mp, ".pdf"))

  # skip MPs that already have at least one file in the declarations folder
  if (has_declaration(mp)) {
    cat(" [ skipped ]\n")
    next
  }

  mp_doc <- tryCatch(htmlParse(page), error = function(e) NULL)
  if (is.null(mp_doc)) {
    cat(" [ page unavailable ]\n")
    next
  }
  hrefs <- attr_values(mp_doc, "//a[contains(@href, '.pdf')]/@href")

  # no, single or multiple declaration(s) available
  if (length(hrefs) == 0) {
    cat(" [ no file ]\n")
  } else if (length(hrefs) == 1) {
    if (fetch_pdf(hrefs, pdf_path)) {
      cat(" [ downloaded", file.info(pdf_path)$size / 1000, "KB ]\n")
    } else {
      cat(" [ failed ]\n")
    }
  } else {
    # several files: number them <mp>-1.pdf, <mp>-2.pdf, ...
    targets <- file.path(out_dir, paste0(mp, "-", seq_along(hrefs), ".pdf"))
    ok <- vapply(
      seq_along(hrefs),
      function(s) fetch_pdf(hrefs[s], targets[s]),
      logical(1)
    )
    if (all(ok)) {
      cat(" [ downloaded", length(hrefs), "files ]\n")
    } else {
      # the skip test works per MP, so drop the incomplete set to make sure
      # the next run fetches all of this MP's files again
      unlink(targets[file.exists(targets)])
      cat(" [ failed ]\n")
    }
  }
}

pdfs <- dir(out_dir, "\\.pdf$", full.names = TRUE)

# a few MPs have an initial declaration and a correction for it
cat(length(unique(sub("(-\\d+)?\\.pdf$", "", pdfs))), "MPs",
    length(pdfs), "files",
    round(sum(file.info(pdfs)$size) / 10^9, 1), "GB\n")

# save a plain text file manifest with file sizes and ctime stamps
# (ctime is the creation time on Windows but the last status change on
# POSIX systems, per the file.info documentation)
write.table(file.info(pdfs)[, c("size", "ctime")],
            file.path(out_dir, "manifest.txt"), quote = FALSE)

# have a nice day
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment