Created
January 21, 2015 09:53
-
-
Save v5tech/5145059b89d8d040c8c9 to your computer and use it in GitHub Desktop.
webmagic爬虫示例
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import java.util.ArrayList; | |
import java.util.List; | |
import us.codecraft.webmagic.Page; | |
import us.codecraft.webmagic.Site; | |
import us.codecraft.webmagic.Spider; | |
import us.codecraft.webmagic.processor.PageProcessor; | |
/** | |
* 爬取http://www.tianyisw.com/ 政策法规数据 | |
* @author welcome | |
* | |
*/ | |
public class CrawlZCFG implements PageProcessor { | |
private Site site = Site.me()/*.setDomain("www.tianyisw.com")*/ | |
.setRetryTimes(3) | |
.setSleepTime(1000) | |
.setUserAgent("Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36"); | |
// 用于匹配网页正文中分页链接 | |
private static final String url = "http://www\\.tianyisw\\.com/Policies\\.asp\\?owen1=政策法规&owen2=&page=\\d+"; | |
// 列表页请求地址 | |
private static final String URL_LIST = "http://www\\.tianyisw.com/Policies\\.asp\\?owen1=%D5%FE%B2%DF%B7%A8%B9%E6&owen2=&page=\\d+"; | |
// 详情页请求地址 | |
private static final String URL_POST = "http://www\\.tianyisw\\.com/show_Policies\\.asp\\?id=\\d+"; | |
@Override | |
public void process(Page page) { | |
// 若请求地址为列表页 | |
if(page.getUrl().regex(URL_LIST).match()){ | |
// 添加详情地址到爬虫请求列表中 | |
page.addTargetRequests(page.getHtml().links().regex(URL_POST).all()); | |
// 处理匹配到的分页请求链接 | |
List<String> list = page.getHtml().links().regex(url).all(); | |
ArrayList<String> lst = new ArrayList<String>(); | |
for (String string : list) { | |
// 处理地址中的中文 | |
String url = string.replace("政策法规", "%D5%FE%B2%DF%B7%A8%B9%E6"); | |
lst.add(url); | |
} | |
// 将分页请求链接地址添加到爬虫请求列表中 | |
page.addTargetRequests(lst); | |
}else{ | |
// 详情页数据解析 为详情地址所指向的内容页 | |
page.putField("title", page.getHtml().xpath("//table/tbody/tr/td[@class=\"tit\"]/strong/text()").toString()); | |
page.putField("body", page.getHtml().xpath("//td[@class=\"hanggao\"]").toString()); | |
} | |
} | |
@Override | |
public Site getSite() { | |
return site; | |
} | |
public static void main(String[] args) { | |
Spider.create(new CrawlZCFG()) | |
// 爬虫初始地址 | |
.addUrl("http://www.tianyisw.com/Policies.asp?owen1=%D5%FE%B2%DF%B7%A8%B9%E6&owen2=&page=1") | |
.thread(5) | |
.run(); | |
} | |
} |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import java.util.ArrayList; | |
import java.util.List; | |
import us.codecraft.webmagic.Page; | |
import us.codecraft.webmagic.Site; | |
import us.codecraft.webmagic.Spider; | |
import us.codecraft.webmagic.processor.PageProcessor; | |
/** | |
* 爬取http://www.tianyisw.com/ 政策解读数据 | |
* @author welcome | |
* | |
*/ | |
public class CrawlZCJD implements PageProcessor { | |
private Site site = Site.me()/*.setDomain("www.tianyisw.com")*/ | |
.setRetryTimes(3) | |
.setSleepTime(1000) | |
.setUserAgent("Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36"); | |
// 用于匹配网页正文中分页链接 | |
private static final String url = "http://www\\.tianyisw\\.com/Policies\\.asp\\?owen1=政策法规&owen2=政策解读&page=\\d+"; | |
// 列表页请求地址 | |
private static final String URL_LIST = "http://www\\.tianyisw\\.com/Policies\\.asp\\?owen1=%D5%FE%B2%DF%B7%A8%B9%E6&owen2=%D5%FE%B2%DF%BD%E2%B6%C1&page=\\d+"; | |
// 详情页请求地址 | |
private static final String URL_POST = "http://www\\.tianyisw\\.com/show_Policies\\.asp\\?id=\\d+"; | |
@Override | |
public void process(Page page) { | |
// 若请求地址为列表页 | |
if(page.getUrl().regex(URL_LIST).match()){ | |
// 添加详情地址到爬虫请求列表中 | |
page.addTargetRequests(page.getHtml().links().regex(URL_POST).all()); | |
// 处理匹配到的分页请求链接 | |
List<String> list = page.getHtml().links().regex(url).all(); | |
ArrayList<String> lst = new ArrayList<String>(); | |
for (String string : list) { | |
// 处理地址中的中文 | |
String url = string.replace("政策法规", "%D5%FE%B2%DF%B7%A8%B9%E6"); | |
url = url.replace("政策解读", "%D5%FE%B2%DF%BD%E2%B6%C1"); | |
lst.add(url); | |
} | |
// 将分页请求链接地址添加到爬虫请求列表中 | |
page.addTargetRequests(lst); | |
}else{ | |
// 详情页数据解析 为详情地址所指向的内容页 | |
page.putField("title", page.getHtml().xpath("//table/tbody/tr/td[@class=\"tit\"]/strong/text()").toString()); | |
page.putField("body", page.getHtml().xpath("//td[@class=\"hanggao\"]").toString()); | |
} | |
} | |
@Override | |
public Site getSite() { | |
return site; | |
} | |
public static void main(String[] args) { | |
Spider.create(new CrawlZCJD()) | |
// 爬虫初始地址 | |
.addUrl("http://www.tianyisw.com/Policies.asp?owen1=%D5%FE%B2%DF%B7%A8%B9%E6&owen2=%D5%FE%B2%DF%BD%E2%B6%C1&page=1") | |
.thread(5) | |
.run(); | |
} | |
} |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import java.util.List; | |
import us.codecraft.webmagic.Page; | |
import us.codecraft.webmagic.Site; | |
import us.codecraft.webmagic.Spider; | |
import us.codecraft.webmagic.pipeline.ConsolePipeline; | |
import us.codecraft.webmagic.processor.PageProcessor; | |
/** | |
* webmagic垂直爬虫爬取深圳医院数据 | |
* 测试类 | |
* @author welcome | |
* | |
*/ | |
public class HospitalCrawl implements PageProcessor | |
{ | |
public static final String URL_LIST = "http://sz\\.91160\\.com/search/index/p-\\d+\\.html"; | |
public static final String URL_POST = "http://sz\\.91160\\.com/unit/show/uid-\\w+\\.html"; | |
private Site site = Site | |
.me() | |
.setDomain("sz.91160.com") | |
.setSleepTime(3000) | |
.setUserAgent("Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36"); | |
@Override | |
public void process(Page page) { | |
//列表页 | |
if (page.getUrl().regex(URL_LIST).match()) { | |
page.addTargetRequests(page.getHtml().xpath("//div[@class=\"search_list layout\"]//ul").links().regex(URL_POST).all()); | |
page.addTargetRequests(page.getHtml().links().regex(URL_LIST).all()); | |
} | |
else | |
{ | |
page.putField("医院名称", page.getHtml().xpath("//div[2]/div[1]/h1/allText()")); | |
page.putField("别名", page.getHtml().xpath("//div[2]/div[2]/div/div/ul[1]/allText()")); | |
page.putField("地址", page.getHtml().xpath("//div[2]/div[2]/div/div/ul[2]/allText()")); | |
page.putField("电话", page.getHtml().xpath("//div[2]/div[2]/div/div/ul[3]/allText()")); | |
page.putField("官网", page.getHtml().xpath("//div[2]/div[2]/div/div/ul[4]/allText()")); | |
page.putField("科室", page.getHtml().xpath("//div[3]/div/div[1]/div[2]/div[2]")); | |
//获取科室信息 | |
List<String> list = page.getHtml().xpath("//div[3]/div/div[1]/div[2]/div[2]").links().regex("http://sz\\.91160\\.com/dep/show/depid-\\w+\\.html").all(); | |
for (String string : list) { | |
System.out.println(string); | |
} | |
} | |
} | |
@Override | |
public Site getSite() { | |
return site; | |
} | |
public static void main(String[] args) | |
{ | |
ConsolePipeline consolePipeline = new ConsolePipeline(); | |
Spider.create(new HospitalCrawl()).addUrl("http://sz.91160.com/search/index/p-2.html").addPipeline(consolePipeline).run(); | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
依赖的jar如下: