Skip to content

Instantly share code, notes, and snippets.

@panfeng
Last active December 24, 2015 23:19
Show Gist options
  • Save panfeng/6879756 to your computer and use it in GitHub Desktop.
Save panfeng/6879756 to your computer and use it in GitHub Desktop.
library(XML)
# Collect the full text of every review linked from one review-listing page.
#
# Args:
#   url: Location of a review-listing page; handed to htmlParse().
#
# Returns:
#   A character vector with one element per "read more" link found on the
#   listing page, holding the text of the first <p class="read"> node of the
#   linked page (NA when the link has no href or the linked page has no such
#   node). NULL when the listing page contains no "read more" links.
get_one_page_comment <- function(url) {
  html <- htmlParse(url)
  link_xpath <- paste0(
    "//div[@class='review-sec']//div[@class='inner']",
    "//div[@class='body']//p[@class='read']",
    "//span[@class='read-more']//a[@class='cmn-viewmore']"
  )
  resnodes <- getNodeSet(html, link_xpath)

  # getNodeSet() gives back an empty list (not NULL) when nothing matches,
  # so the emptiness test has to be on the length.
  if (length(resnodes) == 0) {
    return(NULL)
  }

  # Links to the full-length version of each comment on this page.
  comments_url <- vapply(
    resnodes,
    function(node) {
      as.character(xmlGetAttr(node, name = "href", default = NA_character_))
    },
    character(1)
  )

  # Fetch each linked page and pull the review text out of it.
  vapply(
    comments_url,
    function(link) {
      if (is.na(link)) {
        return(NA_character_)
      }
      body_nodes <- getNodeSet(htmlParse(link), "//p[@class='read']")
      if (length(body_nodes) == 0) {
        return(NA_character_)
      }
      # Only the first matching paragraph is used.
      xmlValue(body_nodes[[1]])
    },
    character(1),
    USE.NAMES = FALSE
  )
}
# Get the total number of review pages for a product.
#
# Args:
#   url: Location of a product review page; handed to htmlParse().
#
# Returns:
#   The text of the link inside the second-to-last <li> of the paging list,
#   converted with as.numeric(). Returns 1 when the page has no paging list.
get_comments_page_num <- function(url) {
  html <- htmlParse(url)
  pager_links <- getNodeSet(
    html,
    "//div[@class='cmn-paging clearfix']//ul[@class='clearfix']//li[last()-1]//a"
  )

  # Indexing [[1]] on an empty node set fails with "subscript out of bounds".
  # NOTE(review): assumes a missing pager means the product has a single page
  # of reviews -- confirm against the site.
  if (length(pager_links) == 0) {
    return(1)
  }

  as.numeric(xmlValue(pager_links[[1]]))
}
# Download every review of a product, page by page.
#
# Args:
#   url: Product review URL shaped like
#     http://www.cosme.net/product/product_id/10010831/reviews
#     The product id is taken as the 6th "/"-separated piece of the URL.
#
# Returns:
#   A list with one element per review page; element i is the character
#   vector returned by get_one_page_comment() for page i with "\n" removed,
#   or NULL when that page yielded no comments.
comment_all <- function(url) {
  end_num <- get_comments_page_num(url)
  if (length(end_num) != 1 || is.na(end_num) || end_num < 1) {
    stop("could not determine the number of review pages for ", url,
         call. = FALSE)
  }

  urlpart <- strsplit(url, "/")
  product_id <- urlpart[[1]][6]
  urlpre <- paste0(
    "http://www.cosme.net/product/product_id/", product_id, "/reviews/p/"
  )

  # Preallocate a real list so that pages with a single comment and pages
  # with many comments can be stored side by side.
  comments_all <- vector("list", end_num)
  for (i in seq_len(end_num)) {
    print(paste0("正在获取第",i,"页所有评论"))
    page_comments <- get_one_page_comment(paste0(urlpre, i))
    if (!is.null(page_comments)) {
      # Strip newlines; the result must be assigned back to take effect.
      page_comments <- gsub(pattern = "\n", replacement = "", page_comments)
    }
    # Assign through [ ] with list() so a NULL page keeps its slot instead of
    # deleting the element and shifting later pages.
    comments_all[i] <- list(page_comments)
  }
  comments_all
}
#####################################################################################################
# Mind the URL structure: /ProductID/reviews/
url<-"http://www.cosme.net/product/product_id/10010831/reviews/"
# The first page of reviews is
# http://www.cosme.net/product/product_id/10010831/reviews/p/1
# If your URL looks like http://www.cosme.net/product/product_id/10010831/reviews/p/1
# change it to http://www.cosme.net/product/product_id/10010831/reviews/p/
# The surrounding parentheses make the assigned value visible so it is
# printed when this line is evaluated at top level.
(comments<-comment_all(url))
# comments is a list
# comments[[a]][b] is the b-th comment on page a
# It can be found in the workspace
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment