Last active
December 24, 2015 23:19
-
-
Save panfeng/6879756 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(XML) | |
# Collect the full text of every review linked from one review-listing page.
#
# Args:
#   url: address of a review-listing page; handed unchanged to htmlParse().
#
# Returns:
#   A character vector with one entry per "view more" link found on the page,
#   in document order. An entry is NA when the link has no href or when the
#   linked page contains no <p class="read"> node. Returns NULL when the
#   listing page contains no such links at all.
get_one_page_comment <- function(url) {
  listing <- htmlParse(url)
  link_xpath <- paste0(
    "//div[@class='review-sec']//div[@class='inner']//div[@class='body']",
    "//p[@class='read']//span[@class='read-more']//a[@class='cmn-viewmore']"
  )
  link_nodes <- getNodeSet(listing, link_xpath)

  # getNodeSet() yields an empty list (not NULL) when nothing matches, so the
  # emptiness test has to be made on the length.
  if (length(link_nodes) == 0L) {
    return(NULL)
  }

  # Address of the long-form page for every review on this listing page.
  comments_url <- vapply(
    link_nodes,
    function(node) {
      as.character(xmlGetAttr(node, name = "href", default = NA_character_))
    },
    character(1)
  )

  # Fetch one long-form page and return the text of its first review body.
  read_full_comment <- function(href) {
    if (is.na(href)) {
      return(NA_character_)
    }
    body_nodes <- getNodeSet(htmlParse(href), "//p[@class='read']")
    if (length(body_nodes) == 0L) {
      return(NA_character_)
    }
    as.character(xmlValue(body_nodes[[1]]))
  }

  vapply(comments_url, read_full_comment, character(1), USE.NAMES = FALSE)
}
# Read the total number of review pages from a product's review page.
#
# Args:
#   url: address of the review page; handed unchanged to htmlParse().
#
# Returns:
#   The text of the second-to-last pager entry's link converted with
#   as.numeric(), or 1 when the page has no pager links.
#   NOTE(review): treating "no pager" as a single page of reviews is an
#   assumption about the site layout - confirm against a one-page product.
get_comments_page_num <- function(url) {
  html <- htmlParse(url)
  pager_links <- getNodeSet(
    html,
    "//div[@class='cmn-paging clearfix']//ul[@class='clearfix']//li[last()-1]//a"
  )

  # An empty node set cannot be indexed with [[1]]; fall back to one page.
  if (length(pager_links) == 0L) {
    return(1)
  }

  as.numeric(xmlValue(pager_links[[1]]))
}
# Download every review of one product, page by page.
#
# Args:
#   url: product review address shaped like
#        http://www.cosme.net/product/product_id/10010831/reviews
#        The product id is taken from the 6th "/"-separated component.
#
# Returns:
#   A list with one element per review page, so result[[a]][b] is the b-th
#   review on page a. Each element is the character vector returned by
#   get_one_page_comment() with newline characters removed, or NULL when
#   that page yielded no reviews.
comment_all <- function(url) {
  end_num <- get_comments_page_num(url)
  if (length(end_num) != 1L || is.na(end_num) || end_num < 1) {
    stop("could not determine the number of review pages for ", url,
         call. = FALSE)
  }

  product_id <- strsplit(url, "/")[[1]][6]
  if (is.na(product_id) || !nzchar(product_id)) {
    stop("could not extract a product id from ", url, call. = FALSE)
  }
  urlpre <- paste0(
    "http://www.cosme.net/product/product_id/", product_id, "/reviews/p/"
  )

  # Preallocate so the result is always a list with one slot per page.
  comments_all <- vector("list", end_num)
  for (i in seq_len(end_num)) {
    print(paste0("正在获取第", i, "页所有评论"))
    page_comments <- get_one_page_comment(paste0(urlpre, i))
    if (!is.null(page_comments)) {
      # Strip newlines; the result must be assigned back to take effect.
      page_comments <- gsub("\n", "", page_comments, fixed = TRUE)
    }
    # `[<-` with list() keeps a NULL page in place instead of dropping it.
    comments_all[i] <- list(page_comments)
  }
  comments_all
}
# Driver ---------------------------------------------------------------------
# Mind the address layout: .../<ProductID>/reviews/
url <- "http://www.cosme.net/product/product_id/10010831/reviews/"
# The first page is
#   http://www.cosme.net/product/product_id/10010831/reviews/p/1
# If your address looks like
#   http://www.cosme.net/product/product_id/10010831/reviews/p/1
# change it to
#   http://www.cosme.net/product/product_id/10010831/reviews/p/
comments <- comment_all(url)
print(comments)
# `comments` is a list: comments[[a]][b] is the b-th review on page a.
# The object remains available in the workspace afterwards.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment