Skip to content

Instantly share code, notes, and snippets.

@mutolisp
Created December 28, 2018 03:12
Show Gist options
  • Save mutolisp/d84e004a088184e9eea4876dd20d9861 to your computer and use it in GitHub Desktop.
R 爬蟲範例
# Load rvest for HTML scraping (read_html, html_nodes, html_text, %>%).
library(rvest)

# Site root, prepended later to the relative file links found on each page.
prefix <- 'http://www.ncyu.edu.tw'
# Listing URL; the 0-based page index is appended to the `pages=` query arg.
acaLawPage <- 'http://www.ncyu.edu.tw/academic/law_list.aspx?pages='

# Fetch page 0 and read the total page count from the <span id="pagecount">
# element, so we know how many listing pages to crawl.
totalPageCount <- paste0(acaLawPage, 0) %>%
  read_html() %>%                                  # download and parse the page
  html_nodes(xpath = '//span[@id="pagecount"]') %>% # locate the page-count span
  html_text() %>%                                   # extract its text content
  as.numeric()                                      # coerce "N" to numeric N
# Download every listing page once up front and cache the parsed documents,
# so later extraction passes don't re-fetch from the network.
# Preallocate the list (avoids growing it inside the loop).
storedPages <- vector("list", totalPageCount)
for (i in seq_len(totalPageCount)) {
  # Site pages are 0-based (`pages=0` .. `pages=totalPageCount-1`);
  # R list slots are 1-based, hence the `i - 1` offset.
  storedPages[[i]] <- paste0(acaLawPage, i - 1) %>%
    read_html()
}
## Exploratory pass over page 1 only (the loop below repeats this for all
## pages): locate the titles first.
pageTitle <- storedPages[[1]] %>%
  html_nodes(xpath = '//table[@class="index_bg02"]//td[@width="40%"]//a//span') %>%
  html_text()

# The 15%-wide cells hold the publication dates (plus some non-date cells).
fileDate <- storedPages[[1]] %>%
  html_nodes(xpath = '//table[@class="index_bg02"]//td[@width="15%"]') %>%
  html_text()
# Keep only cells that contain a digit — filters out header/empty cells.
fileDate <- fileDate[grep('[0-9]', fileDate)]

# The anchors in the 40%-wide cells carry relative download links.
fileURL <- storedPages[[1]] %>%
  html_nodes(xpath = '//table[@class="index_bg02"]//td[@width="40%"]//a') %>%
  html_attr('href')
# Make the relative hrefs absolute by prepending the site root.
fileURL <- paste0(prefix, fileURL)
fileURL  # print for a quick visual sanity check
# Extract (date, title, URL) from every cached page and combine into one
# data frame. Collect per-page data frames in a preallocated list and bind
# once at the end — rbind() inside the loop would copy `pgs` on every pass.
pageRows <- vector("list", totalPageCount)
for (i in seq_len(totalPageCount)) {
  pageTitle <- storedPages[[i]] %>%
    html_nodes(xpath = '//table[@class="index_bg02"]//td[@width="40%"]//a//span') %>%
    html_text()
  fileDate <- storedPages[[i]] %>%
    html_nodes(xpath = '//table[@class="index_bg02"]//td[@width="15%"]') %>%
    html_text()
  # Keep only cells containing a digit (drops header/empty cells).
  fileDate <- fileDate[grep('[0-9]', fileDate)]
  fileURL <- storedPages[[i]] %>%
    html_nodes(xpath = '//table[@class="index_bg02"]//td[@width="40%"]//a') %>%
    html_attr('href')
  fileURL <- paste0(prefix, fileURL)
  # NOTE(review): assumes the three vectors are equal length on every page —
  # if a row lacks a date or link they would silently misalign; verify.
  pageRows[[i]] <- data.frame(fileDate, pageTitle, fileURL)
}
pgs <- do.call(rbind, pageRows)
library(curl)

# Destination directory for the downloaded PDFs; curl_download() fails if the
# directory does not exist, so create it first (no-op when already present).
outDir <- '/tmp/ncyu_aca_affairs'
dir.create(outDir, showWarnings = FALSE, recursive = TRUE)

# Download the first 10 files (or fewer, if the listing has fewer rows —
# a hard-coded 1:10 would error with out-of-bounds indexing).
nFiles <- min(10, nrow(pgs))
for (i in seq_len(nFiles)) {
  # Simple progress report: date of the current file plus percent done.
  paste('Downloading', pgs[i, 1], '(', round(i / nFiles * 100, 1), '%)') %>%
    print()
  # Encode the URL (titles/links may contain non-ASCII characters) and save
  # as "<date>_<title>.pdf". NOTE(review): a title containing '/' would break
  # the path — consider sanitizing if that occurs in practice.
  as.character(pgs[i, 3]) %>%
    URLencode() %>%
    curl_download(file.path(outDir, paste0(pgs[i, 1], '_', pgs[i, 2], '.pdf')))
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment