@yihui
Created July 3, 2017 19:07
The R script I used to scrape the old RViews site (WordPress)

library(xml2)

# Return the hrefs of all post titles on one archive page of the old
# WordPress RViews site (titles are <h2 class="entry-title"><a href=...>)
page_post_links = function(
  page = 1, baseurl = 'https://www.rstudio.com/rviews',
  xpath = '//h2[@class="entry-title"]/a'
) {
  html = read_html(sprintf('%s/page/%d/', baseurl, page))
  xml_attr(xml_find_all(html, xpath), 'href')
}
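
# A minimal usage sketch (the old archive is no longer online in this form,
# so this only illustrates the call pattern; it is not part of the original
# script):
if (FALSE) {
  links1 = page_post_links(1)  # post links on the first archive page
  head(links1)
}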

# Walk the archive pages one by one, collecting post links until a page
# yields no links (i.e., we have gone past the last page)
post_links = function(...) {
  all_links = character()
  i = 1
  repeat {
    message('Reading page ', i)
    links = page_post_links(i, ...)
    if (length(links) == 0) break
    all_links = c(all_links, links)
    i = i + 1
  }
  all_links
}
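
# Usage sketch (an assumption here: pagination ends with a page that has no
# entry titles; if the server returned an HTTP error past the last page
# instead, read_html() would error before the length check, and the call to
# page_post_links() would need a tryCatch()):
if (FALSE) {
  links = post_links()
  length(links)
}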

# Convert a single post to a Markdown file with YAML front matter
page_md = function(link, dir = 'post') {
  html = read_html(link)
  # extract the text (or an attribute) of the first node matching an XPath
  get_content = function(xpath, extract = xml_text, ..., quote = TRUE) {
    res = trimws(extract(xml_find_first(html, xpath), ...))
    if (quote) sprintf('"%s"', res) else res
  }
  # pull out the first substring captured by the regex `reg` among all nodes
  # matching `xpath` (used below to find the category inside a <script>)
  get_sub = function(xpath, reg) {
    x = grep(reg, xml_text(xml_find_all(html, xpath)), value = TRUE)
    if (length(x) > 0) gsub(reg, '\\1', x[1]) else character()
  }
  # extract the post body, convert it from HTML to Markdown via Pandoc, and
  # clean up leftover <div>/<span> tags in the output
  get_body = function() {
    content = xml_find_first(html, '//div[@class="post-content"]')
    content_children = xml_children(content)
    if (length(content_children) > 2) {
      # drop a leading empty node and a 'by AUTHOR' byline if present
      if (xml_text(content_children[1]) == '') xml_remove(content_children[1])
      if (grepl('^by ', xml_text(content_children[2]))) xml_remove(content_children[2])
    }
    tmp = tempfile('tmp', '.', '.html')
    tmp_md = blogdown:::with_ext(tmp, 'md')  # unexported blogdown helpers
    on.exit(unlink(c(tmp, tmp_md)), add = TRUE)
    # write the body (minus the outermost <div>) to a temporary HTML file
    blogdown:::writeUTF8(
      gsub('^<div [^>]+>\\s*|\\s*</div>\\s*$', '', as.character(content)), tmp
    )
    # let Pandoc do the HTML -> Markdown conversion, without hard-wrapping lines
    system2('pandoc', c(tmp, '-o', tmp_md, '--wrap=none'))
    # strip <div>/<span> tags that survived the conversion
    res = gsub(
      '\\s*<(div|span)[^>]*?>\\s*|\\s*</(div|span)>\\s*', '',
      blogdown:::readUTF8(tmp_md)
    )
    # normalize Pandoc's fenced code attributes to the simpler ```r form
    res = gsub('``` {.sourceCode .r}', '```r', res, fixed = TRUE)
    # remove consecutive empty lines
    gsub('\\s*\n\n+\\s*', '\n\n', paste(trimws(res), collapse = '\n'))
  }
  # assemble the YAML metadata fields for the post
  meta = unlist(list(
    title = get_content('//h2[@class="entry-title"]'),
    author = get_content('//a[@rel="author"]'),
    date = get_content('//span[@class="updated"]', quote = FALSE),
    slug = basename(link),
    categories = sprintf(
      '[%s]', get_sub('//script[text()]', '.*?"category":"([^"]+)".*')
    ),
    tags = NULL #,
    # description = gsub(
    #   '(^by .+?\r\n)|\n+|\r+', '',
    #   get_content('//meta[@property="og:description"]', xml_attr, 'content')
    # )
  ))
  if (!dir.exists(dir)) dir.create(dir)
  # write the YAML front matter and body to dir/YYYY-MM-DD-slug.md
  blogdown:::writeUTF8(
    c('---', sprintf('%s: %s', names(meta), meta), '---', '', get_body()),
    file.path(dir, sprintf('%s-%s.md', as.Date(meta['date']), meta['slug']))
  )
}
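
# Usage sketch for a single post (the URL below is made up for illustration;
# it is not a real post from the old site):
if (FALSE) {
  page_md('https://www.rstudio.com/rviews/2017/06/30/some-post/')
  # -> writes post/2017-06-30-some-post.md with YAML front matter
}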

# crawl every post and convert it to Markdown
for (link in post_links()) {
  message('Converting ', link)
  page_md(link)
}
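
# If some posts failed to download or parse, the loop above would stop at the
# first error. A more defensive variant is sketched below (an addition, not
# part of the original script):
if (FALSE) {
  failed = character()
  for (link in post_links()) {
    message('Converting ', link)
    res = tryCatch(page_md(link), error = function(e) {
      warning('Failed on ', link, ': ', conditionMessage(e))
      link  # return the link itself to signal failure
    })
    if (identical(res, link)) failed = c(failed, link)
  }
  failed  # links that could not be converted
}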