Last active
August 29, 2015 13:57
-
-
Save caoxudong/9527644 to your computer and use it in GitHub Desktop.
网页解析工具,基于jsoup解析html
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package xumin; | |
import java.io.IOException; | |
import java.util.HashMap; | |
import java.util.Map; | |
import org.jsoup.Connection; | |
import org.jsoup.Connection.Response; | |
import org.jsoup.Jsoup; | |
import org.jsoup.nodes.Document; | |
import org.jsoup.nodes.Element; | |
import org.jsoup.select.Elements; | |
/** | |
* | |
* 解析目标网页,获取kayak分类下可供选择的房间类型的数量。目标网页将实际网页嵌入其中展示,因此需要对实际网页进行解析。 | |
* | |
* 目标网页: http://www.kayak.com.hk/hotels/InterContinental-Grand-Stanford-Hong-Kong,Hong-Kong,Hong-Kong-c23190-h49497-details/2014-07-01/2014-07-02/2guests/expanded/#overview | |
* | |
* 实际网页: http://www.kayak.com.hk/h/run/hoteldetails/ajax/overview/rates?hid=49497&searchid={searchId}IbEEDFN4sZ | |
* | |
* NOTE:实际网页中的参数searchId需要从目标网页中解析出来,这个参数过一段时间后会过期,所以每次都要解析一下 | |
* | |
* @author caoxudong | |
* | |
*/ | |
public class FetchRoomTypes { | |
private static final String HEADER_USER_AGENT = "Mozilla/5.0 (Windows NT " | |
+ "6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) " | |
+ "Chrome/33.0.1750.146 Safari/537.36"; | |
private static final String HEADER_HOST = "www.kayak.com.hk"; | |
private static final String BASE_URL = "http://www.kayak.com.hk/hotels/" | |
+ "InterContinental-Grand-Stanford-Hong-Kong,Hong-Kong,Hong-Kong" | |
+ "-c23190-h49497-details/2014-07-01/2014-07-02/2guests/expanded/" | |
+ "#overview"; | |
private static final String BASE_KEY_STRING = | |
"R9.RP.Hotels.Details.AjaxDetailsSections"; | |
private static final String ACTUAL_URL = "http://www.kayak.com.hk/h/run/" | |
+ "hoteldetails/ajax/overview/rates?hid=49497&searchid="; | |
private static final Map<String, String> NAME_MAP = new HashMap<String, String>(){ | |
private static final long serialVersionUID = 1L; | |
{ | |
this.put("pvd1p2", "kayak"); | |
this.put("pvd2p2", "booking.com"); | |
this.put("pvd3p2", "expedia"); | |
this.put("pvd4p2", "hotels.com"); | |
} | |
}; | |
public static void main(String[] args) throws IOException { | |
Connection connection = null; | |
Response response = null; | |
String[] substrings = null; | |
Document document = null; | |
Map<String, String> result = new HashMap<>(); | |
//先解析基础页面,以找出searchId,该值会在一段时间后过期,因此每次都要解析一下 | |
connection = Jsoup.connect(BASE_URL); | |
connection.header("Host", HEADER_HOST); | |
connection.get(); | |
response = connection.response(); | |
connection.userAgent(HEADER_USER_AGENT); | |
String responseBody = response.body(); | |
substrings = responseBody.split(BASE_KEY_STRING); | |
if ((substrings == null) || (substrings.length != 2)) { | |
System.err.println("resolve failed, maybe base page changed. " | |
+ "please contact caoxudong"); | |
} else { | |
String targetStr = substrings[1]; | |
/** | |
* 这里targetStr的内容应该是包含了searchId及其后所有的内容,例如 | |
* ("49497", "IPAEA-fQoi", "overview");});jq(function() {if(typeof R9.RP.Hotels.Details.PrettyUrl != "undefined") {。。。。。。 | |
* | |
* 这里按照英文双引号分割出searchId即可 | |
*/ | |
substrings = targetStr.split("\"", 5); | |
String searchId = substrings[3]; | |
String actualUrl = null; | |
actualUrl = ACTUAL_URL + searchId; | |
connection = Jsoup.connect(actualUrl); | |
connection.userAgent(HEADER_USER_AGENT); | |
connection.header("Host", HEADER_HOST); | |
document = connection.get(); | |
Elements elements = document.select("tbody.providerEntry"); | |
for (int i=0; i<elements.size(); i++) { | |
Element element = elements.get(i); | |
String id = element.id(); | |
Elements aTags = element.select("span.showRoomsLink a"); | |
if ((aTags == null) || (aTags.size() != 1)) { | |
System.err.println("resolve failed, maybe base page changed. " | |
+ "please contact caoxudong"); | |
break; | |
} else { | |
Element aTag = aTags.get(0); | |
String aTagContent = aTag.text(); //e.g. "show 7 more room types" | |
result.put(id, aTagContent.split(" ", 3)[1]); | |
} | |
} | |
} | |
//print result | |
for (Map.Entry<String, String> entry: result.entrySet()) { | |
System.out.println(NAME_MAP.get(entry.getKey()) + ": " + entry.getValue()); | |
} | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment