Created
January 24, 2016 17:21
-
-
Save mrdwab/07f5239403e6cd61d9a2 to your computer and use it in GitHub Desktop.
Downloads the images that comprise an issuu publication and uses ImageMagick to convert the JPEGs to a PDF. Somewhat glitchy and not the most stable, since it relies on item positions in at least two places (though it could probably be rewritten to avoid that).
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| library(rvest) | |
| library(jsonlite) | |
#' Download an issuu publication and save it as a PDF in the home directory
#'
#' Reads the publication page, extracts the document metadata from the inline
#' `window.issuuDataCache` script, downloads one JPEG per page with `wget`,
#' and stitches the pages into a single PDF with ImageMagick's `convert`.
#' The PDF is named after the publication title and copied to `~/`.
#'
#' All intermediate files live in a private temporary directory that is
#' removed when the function exits, whether it succeeds or fails. The working
#' directory of the R session is never changed.
#'
#' If a file with the target name already exists in the home directory it is
#' left untouched, a warning is raised, and nothing is downloaded.
#'
#' @param url URL of the issuu publication page.
#' @return The document property list parsed from the page (the same object
#'   that supplies `documentId`, `pageCount` and `title`).
issuu <- function(url) {
  marker <- "window.issuuDataCache = "

  # Find the inline script by its content instead of by its position in the
  # page, so that an added or removed <script> tag does not break parsing.
  scripts <- read_html(url) %>%
    html_nodes("script") %>%
    html_text()
  hits <- grep(marker, scripts, fixed = TRUE)
  if (length(hits) == 0L) {
    stop("Could not find the issuu data cache in the page at ", url,
         call. = FALSE)
  }

  # fixed = TRUE: the marker is literal text, not a regular expression.
  cache_json <- gsub(marker, "", scripts[[hits[1L]]], fixed = TRUE)

  # NOTE(review): the document entry is still assumed to be the second item
  # of `apiCache`; the layout of that structure is not visible from here.
  props <- fromJSON(cache_json)[["apiCache"]][[2]][["document"]]

  doc_id <- props[["documentId"]]
  page_count <- props[["pageCount"]]
  if (length(doc_id) != 1L || length(page_count) != 1L ||
      is.na(page_count) || page_count < 1) {
    stop("The page at ", url, " did not yield a usable document id and ",
         "page count", call. = FALSE)
  }

  fl <- sprintf(
    "http://image.issuu.com/%s/jpg/page_%d.jpg",
    doc_id, seq_len(page_count)
  )

  # Build the output name from the title; path separators are replaced as
  # well as whitespace so the title cannot escape the target directory.
  title <- props[["title"]]
  if (length(title) != 1L || is.na(title) || !nzchar(title)) {
    title <- doc_id
  }
  output <- paste0(gsub("\\s+|[/\\\\]", "_", title), ".pdf")
  dest <- file.path(path.expand("~"), output)

  # file.copy() never overwrites by default, so an existing file would make
  # the whole download pointless. Bail out before doing any work.
  if (file.exists(dest)) {
    warning("'", dest, "' already exists; nothing was downloaded",
            call. = FALSE)
    return(props)
  }

  # Private scratch directory: avoids collisions with stale page_N.jpg files
  # from earlier runs and makes cleanup a single unlink().
  work_dir <- tempfile("issuu_")
  dir.create(work_dir)
  on.exit(unlink(work_dir, recursive = TRUE), add = TRUE)

  list_file <- file.path(work_dir, "filelist.txt")
  writeLines(fl, con = list_file)

  # -i reads the URLs from the list, -P sets the download directory.
  system2(
    "wget",
    c("-i", shQuote(list_file), "-P", shQuote(work_dir)),
    stdout = FALSE, stderr = FALSE
  )

  pages <- file.path(work_dir, basename(fl))
  got <- pages[file.exists(pages)]
  if (length(got) == 0L) {
    stop("No page images could be downloaded (is 'wget' installed and the ",
         "network reachable?)", call. = FALSE)
  }
  if (length(got) < length(pages)) {
    warning(length(pages) - length(got), " of ", length(pages),
            " page images could not be downloaded; the PDF is incomplete",
            call. = FALSE)
  }

  # Every path is shell-quoted: the output name is derived from remote data.
  pdf_tmp <- file.path(work_dir, output)
  status <- system2("convert", c(shQuote(got), shQuote(pdf_tmp)))
  if (!identical(status, 0L) || !file.exists(pdf_tmp)) {
    stop("ImageMagick 'convert' failed to build the PDF", call. = FALSE)
  }

  if (!file.copy(pdf_tmp, dest)) {
    stop("Could not copy the PDF to '", dest, "'", call. = FALSE)
  }

  props
}
| ## Example run: fetches the sample publication below and saves it as a PDF in the home directory. | |
| issuu("http://issuu.com/amnestypublishing/docs/escr_primer") |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment