Created
December 29, 2012 08:09
-
-
Save bluishoul/4405363 to your computer and use it in GitHub Desktop.
Spider
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package common; | |
import java.io.BufferedReader; | |
import java.io.IOException; | |
import java.io.InputStream; | |
import java.io.InputStreamReader; | |
import java.net.HttpURLConnection; | |
import java.net.URL; | |
import java.util.ArrayList; | |
import java.util.List; | |
import java.util.Properties; | |
import org.apache.commons.codec.StringDecoder; | |
import org.apache.commons.httpclient.HttpClient; | |
import org.apache.commons.httpclient.HttpStatus; | |
import org.apache.commons.httpclient.NameValuePair; | |
import org.apache.commons.httpclient.methods.PostMethod; | |
import org.apache.commons.httpclient.params.HttpMethodParams; | |
import org.apache.commons.lang.SystemUtils; | |
import org.jsoup.Jsoup; | |
import org.jsoup.nodes.Document; | |
import org.jsoup.nodes.Element; | |
import org.jsoup.select.Elements; | |
/** | |
* @author Administrator | |
*/ | |
public class Spider { | |
public static String OSC_ARTICLE_POST_URL = "http://localhost/action/spider/postArticle"; | |
public final static String USER_AGNET = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.19 (KHTML, like Gecko) Chrome/25.0.1321.0 Safari/537.19"; | |
private static final String REMOVE_PREFIX = "[置顶]"; | |
/** | |
* 代理服务器的地址 | |
*/ | |
private static String proxyHost; | |
/** | |
* 代理服务器的端口 | |
*/ | |
private static String proxyPort; | |
/** | |
* 代理服务器用户名 | |
*/ | |
private static String proxyUser; | |
/** | |
* 代理服务器密码 | |
*/ | |
private static String proxyPassword; | |
/** | |
* 网页抓取方法 | |
* | |
* @param urlString 要抓取的url地址 | |
* @param charset 网页编码方式 | |
* @param timeout 超时时间 | |
* @return 抓取的网页内容 | |
* @throws IOException 抓取异常 | |
*/ | |
public static String GetWebContent(String urlString, final String charset, | |
int timeout) throws IOException { | |
if (urlString == null || urlString.length() == 0) { | |
return null; | |
} | |
urlString = (urlString.startsWith("http://") || urlString | |
.startsWith("https://")) ? urlString : ("http://" + urlString) | |
.intern(); | |
URL url = new URL(urlString); | |
HttpURLConnection conn = (HttpURLConnection) url.openConnection(); | |
GetProxy(); | |
conn.setRequestProperty( | |
"User-Agent", | |
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.52 Safari/536.5");// 增加报头,模拟浏览器,防止屏蔽 | |
conn.setRequestProperty("Accept", "text/html");// 只接受text/html类型,当然也可以接受图片,pdf,*/*任意,就是tomcat/conf/web里面定义那些 | |
conn.setConnectTimeout(timeout); | |
try { | |
if (conn.getResponseCode() != HttpURLConnection.HTTP_OK) { | |
return "请坚持网络连接"; | |
} | |
} catch (IOException e) { | |
System.err.println("网络出现问题"); | |
return null; | |
} | |
InputStream input = conn.getInputStream(); | |
BufferedReader reader = new BufferedReader(new InputStreamReader(input, | |
charset)); | |
String line = null; | |
StringBuffer sb = new StringBuffer(); | |
while ((line = reader.readLine()) != null) { | |
sb.append(line).append("\r\n"); | |
} | |
if (reader != null) { | |
reader.close(); | |
} | |
if (conn != null) { | |
conn.disconnect(); | |
} | |
return sb.toString(); | |
} | |
public static Document GetDocument(String...url) throws IOException { | |
// System.out.println("Fetching "+url); | |
String html = Spider.GetWebContent(url[0], url.length==1?"UTF-8":url[1]); | |
return Jsoup.parse(html); | |
} | |
/** | |
* 网页抓取方法 | |
* | |
* @param urlString 要抓取的url地址 | |
* @return 抓取的网页内容 | |
* @throws IOException 抓取异常 | |
*/ | |
public static String GetWebContent(String urlString) throws IOException { | |
return GetWebContent(urlString, "iso-8859-1", 5000); | |
} | |
/** | |
* 网页抓取方法 | |
* | |
* @param urlString 要抓取的url地址 | |
* @param pageCharset 目标网页编码方式 | |
* @return 抓取的网页内容 | |
* @throws IOException 抓取异常 | |
*/ | |
public static String GetWebContent(String urlString, String pageCharset) | |
throws IOException { | |
String strHTML = GetWebContent(urlString, pageCharset, 5000); | |
if ("".equals(strHTML) || strHTML == null) { | |
return urlString; | |
} | |
return strHTML; | |
} | |
/** | |
* 设定代理服务器 | |
* | |
* @param proxyHost | |
* @param proxyPort | |
*/ | |
public static void SetProxy(String proxyHost, String proxyPort) { | |
SetProxy(proxyHost, proxyPort, null, null); | |
} | |
/** | |
* 设定代理服务器 | |
* | |
* @param sproxyHost 代理服务器的地址 | |
* @param sproxyPort 代理服务器的端口 | |
* @param sproxyUser 代理服务器用户名 | |
* @param sproxyPassword 代理服务器密码 | |
*/ | |
public static void SetProxy(String sproxyHost, String sproxyPort, | |
String sproxyUser, String sproxyPassword) { | |
proxyHost = sproxyHost; | |
proxyPort = sproxyPort; | |
if (sproxyPassword != null && sproxyPassword.length() > 0) { | |
proxyUser = sproxyUser; | |
proxyPassword = sproxyPassword; | |
} | |
} | |
/** | |
* 取得代理设定 | |
* | |
* @return | |
*/ | |
private static Properties GetProxy() { | |
Properties propRet = null; | |
if (proxyHost != null && proxyHost.length() > 0) { | |
propRet = System.getProperties(); | |
// 设置http访问要使用的代理服务器的地址 | |
propRet.setProperty("http.proxyHost", proxyHost); | |
// 设置http访问要使用的代理服务器的端口 | |
propRet.setProperty("http.proxyPort", proxyPort); | |
if (proxyUser != null && proxyUser.length() > 0) { | |
// 用户名密码 | |
propRet.setProperty("http.proxyUser", proxyUser); | |
propRet.setProperty("http.proxyPassword", proxyPassword); | |
} | |
} | |
return propRet; | |
} | |
/** | |
* @param html | |
* @param className | |
* @return | |
*/ | |
public static List<Element> GetElementByClassName(String html, | |
String className) { | |
Document doc = Jsoup.parse(html); | |
List<Element> list = new ArrayList<Element>(); | |
Elements elms = doc.getElementsByClass(className); | |
for (int i = 0; i < elms.size(); i++) { | |
list.add(elms.get(i)); | |
} | |
return list; | |
} | |
public static String postToOschina(int uid, Article art, String module) { | |
HttpClient client = new HttpClient(); | |
client.getParams().setParameter(HttpMethodParams.HTTP_CONTENT_CHARSET, | |
"UTF-8"); | |
client.getParams().setParameter(HttpMethodParams.USER_AGENT, USER_AGNET); | |
PostMethod pm = new PostMethod(OSC_ARTICLE_POST_URL); | |
String title = art.getTitle(); | |
title = HtmlUtil.filterUserInputContent(title).replace(REMOVE_PREFIX, | |
""); | |
NameValuePair[] article = {new NameValuePair("uid", uid + ""), | |
new NameValuePair("title", title), | |
new NameValuePair("link", art.getLink()), | |
new NameValuePair("content", art.getContent()), | |
new NameValuePair("module", module)}; | |
pm.setRequestBody(article); | |
String info = ""; | |
try { | |
client.executeMethod(pm); | |
int code = pm.getStatusCode(); | |
if (code == HttpStatus.SC_OK){ | |
info = new String(pm.getResponseBodyAsString()); | |
System.err.println("Spider:" + info); | |
} | |
System.out.println("Spider:the post return value" | |
+ pm.getStatusLine()); | |
pm.releaseConnection(); | |
} catch (Exception e) { | |
e.printStackTrace(); | |
return ""; | |
} | |
return info; | |
} | |
public static void main(String[] args) throws Exception { | |
System.out.println(Spider.GetDocument("http://tieba.baidu.com/index.html")); | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment