mutolisp · December 28, 2018 03:12
diff --git a/20181228_rvest.r b/20181228_rvest.r

 # load rvest package
 library(rvest)
 # Site root; prepended to the relative hrefs scraped from the listing pages.
 prefix <- 'http://www.ncyu.edu.tw'
 # Listing URL without its page index; the index is appended on each request.
 acaLawPage <- 'http://www.ncyu.edu.tw/academic/law_list.aspx?pages='

 # Read the number of listing pages from the <span id="pagecount"> element of
 # the page requested with index 0.
 totalPageCount <- paste0(acaLawPage, 0) %>%
   read_html() %>%                                   # fetch and parse the page
   html_nodes(xpath = '//span[@id="pagecount"]') %>% # locate the page counter
   html_text() %>%                                   # node -> plain text
   as.numeric()                                      # text -> number

 # Fail early and clearly: a missing, duplicated or non-numeric counter would
 # otherwise only surface later as an obscure error in the page-fetching loop.
 if (length(totalPageCount) != 1 || is.na(totalPageCount) ||
     totalPageCount < 0) {
   stop("Could not read a single page count from ", acaLawPage, 0,
        call. = FALSE)
 }

 # Fetch and parse every listing page up front. Page indices in the URL start
 # at 0, so storedPages[[k]] holds the page requested with index k - 1.
 # seq_len() yields an empty sequence for a page count of 0 (0:(n - 1) would
 # yield c(0, -1)), and lapply() builds the list in one pass instead of
 # growing it inside a loop.
 storedPages <- lapply(
   seq_len(totalPageCount) - 1,
   function(pageIndex) read_html(paste0(acaLawPage, pageIndex))
 )
 ## First, find the titles (trial run on the first stored listing page)
 pageTitle <- html_text(
   html_nodes(
     storedPages[[1]],
     xpath = '//table[@class="index_bg02"]//td[@width="40%"]//a//span'
   )
 )

 # Text of every 15%-wide cell; only the ones containing a digit are kept.
 fileDate <- html_text(
   html_nodes(
     storedPages[[1]],
     xpath = '//table[@class="index_bg02"]//td[@width="15%"]'
   )
 )
 fileDate <- grep('[0-9]', fileDate, value = TRUE)

 # href of every link in the title column, with the site root prepended.
 fileURL <- html_attr(
   html_nodes(
     storedPages[[1]],
     xpath = '//table[@class="index_bg02"]//td[@width="40%"]//a'
   ),
   'href'
 )
 fileURL <- paste0(prefix, fileURL)
 fileURL

 # Build `pgs`: one row per listed file with columns fileDate, pageTitle and
 # fileURL (in that order), covering every stored listing page.
 # Each page's rows go into a preallocated list and are combined once at the
 # end, instead of growing `pgs` with rbind() on every iteration.
 pageTables <- vector("list", length(storedPages))
 for (i in seq_along(storedPages)) {
   pageTitle <- storedPages[[i]] %>%
     html_nodes(xpath = '//table[@class="index_bg02"]//td[@width="40%"]//a//span') %>%
     html_text()

   fileDate <- storedPages[[i]] %>%
     html_nodes(xpath = '//table[@class="index_bg02"]//td[@width="15%"]') %>%
     html_text()
   # Keep only the cells that contain at least one digit.
   fileDate <- fileDate[grep('[0-9]', fileDate)]

   fileURL <- storedPages[[i]] %>%
     html_nodes(xpath = '//table[@class="index_bg02"]//td[@width="40%"]//a') %>%
     html_attr('href')
   # rep() keeps the result empty when the page has no links; pasting a
   # length-one prefix onto character(0) would return the bare prefix.
   fileURL <- paste0(rep(prefix, length(fileURL)), fileURL)

   # Recycling a shorter vector would pair dates, titles and URLs taken from
   # different rows, so stop instead of building a misaligned table.
   if (length(unique(lengths(list(fileDate, pageTitle, fileURL)))) != 1) {
     stop("Listing page ", i, " yielded ", length(fileDate), " dates, ",
          length(pageTitle), " titles and ", length(fileURL), " URLs",
          call. = FALSE)
   }

   currentPage <- data.frame(fileDate, pageTitle, fileURL,
                             stringsAsFactors = FALSE)
   pageTables[[i]] <- currentPage
 }
 pgs <- if (length(pageTables) > 0) {
   do.call(rbind, pageTables)
 } else {
   # No pages at all: keep `pgs` a data frame with the expected columns.
   data.frame(fileDate = character(), pageTitle = character(),
              fileURL = character(), stringsAsFactors = FALSE)
 }
 library(curl)
 # Download the first listed files (at most 10) into downloadDir, naming each
 # one <date>_<title>.pdf.
 downloadDir <- '/tmp/ncyu_aca_affairs/'
 # The destination directory has to exist before files can be written into it.
 dir.create(downloadDir, recursive = TRUE, showWarnings = FALSE)
 # Never index past the end of `pgs` when fewer than 10 files are listed.
 downloadCount <- min(10, nrow(pgs))
 for ( i in seq_len(downloadCount) ) {
   paste('Downloading', pgs[i,1], '(', round(i/downloadCount*100,1),'%)') %>%
     print()
   # A "/" inside the date or title would be read as a directory separator.
   fileName <- gsub('/', '_', paste0(pgs[i,1], '_', pgs[i,2], '.pdf'),
                    fixed = TRUE)
   as.character(pgs[i,3]) %>%
     URLencode() %>%
     curl_download(paste0(downloadDir, fileName))
 }

	# load rvest package
	library(rvest)
	# Site root; prepended to the relative hrefs scraped from the listing pages.
	prefix <- 'http://www.ncyu.edu.tw'
	# Listing URL without its page index; the index is appended on each request.
	acaLawPage <- 'http://www.ncyu.edu.tw/academic/law_list.aspx?pages='

	# Read the number of listing pages from the <span id="pagecount"> element of
	# the page requested with index 0.
	totalPageCount <- paste0(acaLawPage, 0) %>%
	  read_html() %>%                                   # fetch and parse the page
	  html_nodes(xpath = '//span[@id="pagecount"]') %>% # locate the page counter
	  html_text() %>%                                   # node -> plain text
	  as.numeric()                                      # text -> number

	# Fail early and clearly: a missing, duplicated or non-numeric counter would
	# otherwise only surface later as an obscure error in the page-fetching loop.
	if (length(totalPageCount) != 1 || is.na(totalPageCount) ||
	    totalPageCount < 0) {
	  stop("Could not read a single page count from ", acaLawPage, 0,
	       call. = FALSE)
	}

	# Fetch and parse every listing page up front. Page indices in the URL start
	# at 0, so storedPages[[k]] holds the page requested with index k - 1.
	# seq_len() yields an empty sequence for a page count of 0 (0:(n - 1) would
	# yield c(0, -1)), and lapply() builds the list in one pass instead of
	# growing it inside a loop.
	storedPages <- lapply(
	  seq_len(totalPageCount) - 1,
	  function(pageIndex) read_html(paste0(acaLawPage, pageIndex))
	)
	## First, find the titles (trial run on the first stored listing page)
	pageTitle <- html_text(
	  html_nodes(
	    storedPages[[1]],
	    xpath = '//table[@class="index_bg02"]//td[@width="40%"]//a//span'
	  )
	)

	# Text of every 15%-wide cell; only the ones containing a digit are kept.
	fileDate <- html_text(
	  html_nodes(
	    storedPages[[1]],
	    xpath = '//table[@class="index_bg02"]//td[@width="15%"]'
	  )
	)
	fileDate <- grep('[0-9]', fileDate, value = TRUE)

	# href of every link in the title column, with the site root prepended.
	fileURL <- html_attr(
	  html_nodes(
	    storedPages[[1]],
	    xpath = '//table[@class="index_bg02"]//td[@width="40%"]//a'
	  ),
	  'href'
	)
	fileURL <- paste0(prefix, fileURL)
	fileURL

	# Build `pgs`: one row per listed file with columns fileDate, pageTitle and
	# fileURL (in that order), covering every stored listing page.
	# Each page's rows go into a preallocated list and are combined once at the
	# end, instead of growing `pgs` with rbind() on every iteration.
	pageTables <- vector("list", length(storedPages))
	for (i in seq_along(storedPages)) {
	  pageTitle <- storedPages[[i]] %>%
	    html_nodes(xpath = '//table[@class="index_bg02"]//td[@width="40%"]//a//span') %>%
	    html_text()

	  fileDate <- storedPages[[i]] %>%
	    html_nodes(xpath = '//table[@class="index_bg02"]//td[@width="15%"]') %>%
	    html_text()
	  # Keep only the cells that contain at least one digit.
	  fileDate <- fileDate[grep('[0-9]', fileDate)]

	  fileURL <- storedPages[[i]] %>%
	    html_nodes(xpath = '//table[@class="index_bg02"]//td[@width="40%"]//a') %>%
	    html_attr('href')
	  # rep() keeps the result empty when the page has no links; pasting a
	  # length-one prefix onto character(0) would return the bare prefix.
	  fileURL <- paste0(rep(prefix, length(fileURL)), fileURL)

	  # Recycling a shorter vector would pair dates, titles and URLs taken from
	  # different rows, so stop instead of building a misaligned table.
	  if (length(unique(lengths(list(fileDate, pageTitle, fileURL)))) != 1) {
	    stop("Listing page ", i, " yielded ", length(fileDate), " dates, ",
	         length(pageTitle), " titles and ", length(fileURL), " URLs",
	         call. = FALSE)
	  }

	  currentPage <- data.frame(fileDate, pageTitle, fileURL,
	                            stringsAsFactors = FALSE)
	  pageTables[[i]] <- currentPage
	}
	pgs <- if (length(pageTables) > 0) {
	  do.call(rbind, pageTables)
	} else {
	  # No pages at all: keep `pgs` a data frame with the expected columns.
	  data.frame(fileDate = character(), pageTitle = character(),
	             fileURL = character(), stringsAsFactors = FALSE)
	}
	library(curl)
	# Download the first listed files (at most 10) into downloadDir, naming each
	# one <date>_<title>.pdf.
	downloadDir <- '/tmp/ncyu_aca_affairs/'
	# The destination directory has to exist before files can be written into it.
	dir.create(downloadDir, recursive = TRUE, showWarnings = FALSE)
	# Never index past the end of `pgs` when fewer than 10 files are listed.
	downloadCount <- min(10, nrow(pgs))
	for ( i in seq_len(downloadCount) ) {
	  paste('Downloading', pgs[i,1], '(', round(i/downloadCount*100,1),'%)') %>%
	    print()
	  # A "/" inside the date or title would be read as a directory separator.
	  fileName <- gsub('/', '_', paste0(pgs[i,1], '_', pgs[i,2], '.pdf'),
	                   fixed = TRUE)
	  as.character(pgs[i,3]) %>%
	    URLencode() %>%
	    curl_download(paste0(downloadDir, fileName))
	}