Skip to content

Instantly share code, notes, and snippets.

@caoxudong
Last active August 29, 2015 13:57
Show Gist options
  • Save caoxudong/9527644 to your computer and use it in GitHub Desktop.
Save caoxudong/9527644 to your computer and use it in GitHub Desktop.
网页解析工具,基于jsoup解析html
package xumin;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import org.jsoup.Connection;
import org.jsoup.Connection.Response;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
/**
 * Parses the target page to find how many room types each booking provider
 * offers for one hotel on kayak.com.hk. The target page embeds the actual
 * rates page, so the actual page has to be fetched and parsed separately.
 *
 * Target page: http://www.kayak.com.hk/hotels/InterContinental-Grand-Stanford-Hong-Kong,Hong-Kong,Hong-Kong-c23190-h49497-details/2014-07-01/2014-07-02/2guests/expanded/#overview
 *
 * Actual page: http://www.kayak.com.hk/h/run/hoteldetails/ajax/overview/rates?hid=49497&searchid={searchId}IbEEDFN4sZ
 *
 * NOTE: the searchId parameter of the actual page must be extracted from the
 * target page. It expires after a while, so it is resolved again on every run.
 *
 * @author caoxudong
 */
public class FetchRoomTypes {

    private static final String HEADER_USER_AGENT = "Mozilla/5.0 (Windows NT "
            + "6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) "
            + "Chrome/33.0.1750.146 Safari/537.36";

    private static final String HEADER_HOST = "www.kayak.com.hk";

    private static final String BASE_URL = "http://www.kayak.com.hk/hotels/"
            + "InterContinental-Grand-Stanford-Hong-Kong,Hong-Kong,Hong-Kong"
            + "-c23190-h49497-details/2014-07-01/2014-07-02/2guests/expanded/"
            + "#overview";

    /** Literal marker in the target page that immediately precedes the searchId arguments. */
    private static final String BASE_KEY_STRING =
            "R9.RP.Hotels.Details.AjaxDetailsSections";

    private static final String ACTUAL_URL = "http://www.kayak.com.hk/h/run/"
            + "hoteldetails/ajax/overview/rates?hid=49497&searchid=";

    private static final String RESOLVE_FAILED_MESSAGE =
            "resolve failed, maybe base page changed. please contact caoxudong";

    /** Maps the element id of a provider row to a human-readable provider name. */
    private static final Map<String, String> NAME_MAP = buildNameMap();

    /** Builds the provider-id to provider-name table without an anonymous HashMap subclass. */
    private static Map<String, String> buildNameMap() {
        Map<String, String> names = new HashMap<String, String>();
        names.put("pvd1p2", "kayak");
        names.put("pvd2p2", "booking.com");
        names.put("pvd3p2", "expedia");
        names.put("pvd4p2", "hotels.com");
        return names;
    }

    /**
     * Fetches the target page, resolves the searchId, fetches the rates page and
     * prints one "provider: count" line per provider to standard output.
     * Resolution problems are reported on standard error.
     *
     * @param args ignored
     * @throws IOException if either HTTP request fails
     */
    public static void main(String[] args) throws IOException {
        Map<String, String> result = new HashMap<>();

        // Resolve the searchId from the target page first; it expires after a
        // while, so it has to be looked up on every run.
        String searchId = extractSearchId(fetchBody(BASE_URL));
        if (searchId == null) {
            System.err.println(RESOLVE_FAILED_MESSAGE);
        } else {
            Document ratesPage = openConnection(ACTUAL_URL + searchId).get();
            collectRoomTypeCounts(ratesPage, result);
        }

        // print result
        for (Map.Entry<String, String> entry : result.entrySet()) {
            String providerName = NAME_MAP.get(entry.getKey());
            if (providerName == null) {
                // Unknown provider row: show its raw id rather than "null".
                providerName = entry.getKey();
            }
            System.out.println(providerName + ": " + entry.getValue());
        }
    }

    /** Creates a connection with the User-Agent and Host headers set before any request is sent. */
    private static Connection openConnection(String url) {
        return Jsoup.connect(url)
                .userAgent(HEADER_USER_AGENT)
                .header("Host", HEADER_HOST);
    }

    /** Fetches a page and returns its raw body; execute() is used because no parsed document is needed. */
    private static String fetchBody(String url) throws IOException {
        Response response = openConnection(url).execute();
        return response.body();
    }

    /**
     * Extracts the searchId from the raw text of the target page.
     *
     * The text following the marker is expected to look like
     * ("49497", "IPAEA-fQoi", "overview");});jq(function() {...
     * so the searchId is the second double-quoted value after the marker.
     *
     * @param pageBody raw body of the target page
     * @return the searchId, or null if the marker does not occur exactly once
     *         or the quoted arguments are missing
     */
    private static String extractSearchId(String pageBody) {
        // Literal search: the marker contains dots that must not act as regex wildcards.
        int markerStart = pageBody.indexOf(BASE_KEY_STRING);
        if (markerStart < 0) {
            return null;
        }
        int markerEnd = markerStart + BASE_KEY_STRING.length();
        if (pageBody.indexOf(BASE_KEY_STRING, markerEnd) >= 0) {
            // More than one marker: the page layout is not the expected one.
            return null;
        }
        // Splitting on double quotes yields: "(", hotel id, ", ", searchId, rest.
        String[] pieces = pageBody.substring(markerEnd).split("\"", 5);
        if (pieces.length < 4 || pieces[3].isEmpty()) {
            return null;
        }
        return pieces[3];
    }

    /**
     * Reads the "show N more room types" link of every provider row and stores
     * N under the row's element id. Stops at the first row that does not have
     * the expected shape, after reporting it on standard error.
     *
     * @param ratesPage parsed rates page
     * @param result receives one entry per successfully parsed provider row
     */
    private static void collectRoomTypeCounts(Document ratesPage, Map<String, String> result) {
        for (Element providerRow : ratesPage.select("tbody.providerEntry")) {
            Elements links = providerRow.select("span.showRoomsLink a");
            if (links.size() != 1) {
                System.err.println(RESOLVE_FAILED_MESSAGE);
                break;
            }
            // e.g. "show 7 more room types" -> the second word is the count
            String[] words = links.get(0).text().split(" ", 3);
            if (words.length < 2) {
                System.err.println(RESOLVE_FAILED_MESSAGE);
                break;
            }
            result.put(providerRow.id(), words[1]);
        }
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment