Skip to content

Instantly share code, notes, and snippets.

@caoxudong
Created March 13, 2014 14:22
Show Gist options
  • Save caoxudong/9529439 to your computer and use it in GitHub Desktop.
Save caoxudong/9529439 to your computer and use it in GitHub Desktop.
解析kayak网站上酒店与市中心的距离
package xumin;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
/**
 * Parses Kayak hotel search result pages and prints, for every hotel found,
 * its distance from the city centre.
 *
 * <p>Target page: http://www.kayak.com.hk/hotels/Hong-Kong,Hong-Kong-c23190/2014-04-01/
 * 2014-04-02/2guests?pn={pageNumber}
 *
 * <p>The program takes two optional arguments; if more than two are given,
 * only the first two are read:
 * <ol>
 * <li>startPageNumber: the page to start fetching from, default 0</li>
 * <li>pagesCount: how many pages to fetch in total, default 1</li>
 * </ol>
 *
 * <p>For simplicity the pages are processed sequentially on a single thread;
 * this could later be changed to use multiple threads or processes.
 *
 * <p>NOTE: page numbers start at 0.
 *
 * @author caoxudong
 */
public class FetchDistanceFromHongKong {

    private static final String HEADER_USER_AGENT = "Mozilla/5.0 (Windows NT "
            + "6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) "
            + "Chrome/33.0.1750.146 Safari/537.36";

    private static final String HEADER_HOST = "www.kayak.com.hk";

    /** Message printed when a hotel cell does not have the expected structure. */
    private static final String RESOLVE_FAILED_MESSAGE = "resolve failed, "
            + "maybe base page changed. please contact "
            + "caoxudong";

    /** URL prefix of a result page; the page number is appended to it. */
    public static String BASE_URL = "http://www.kayak.com.hk/hotels/"
            + "Hong-Kong,Hong-Kong-c23190/2014-04-01/2014-04-02/2guests?pn=";

    /** Zero-based number of the first page to fetch. */
    public static int startPageNumber = 0;

    /** Number of consecutive pages to fetch. */
    public static int pagesCount = 1;

    /**
     * Entry point: fetches {@link #pagesCount} pages starting at
     * {@link #startPageNumber} and prints the hotel name / distance pairs
     * found on each page, one page at a time.
     *
     * @param args optional {@code startPageNumber} and {@code pagesCount};
     *             arguments beyond the second are ignored
     * @throws IOException if a page cannot be fetched
     */
    public static void main(String[] args) throws IOException {
        try {
            parseArguments(args);
        } catch (IllegalArgumentException e) {
            // NumberFormatException is an IllegalArgumentException, so this
            // covers both non-numeric and out-of-range values.
            System.err.println("Invalid arguments. "
                    + "They should be numbers, and greater than 0.");
            System.exit(1);
        }

        for (int offset = 0; offset < pagesCount; offset++) {
            int pageNumber = startPageNumber + offset;
            // A fresh map per page, so each page header lists only its own hotels.
            Map<String, String> result = fetchPage(pageNumber);

            System.out.println("Page Number: " + pageNumber);
            for (Map.Entry<String, String> entry : result.entrySet()) {
                System.out.println(entry.getKey() + "\t" + entry.getValue());
            }
            System.out.println();
        }
    }

    /**
     * Reads the optional command line arguments into
     * {@link #startPageNumber} and {@link #pagesCount}. The fields are only
     * updated when every supplied value is valid.
     *
     * @param args command line arguments, may be {@code null} or empty
     * @throws IllegalArgumentException if a value is not an integer, the
     *         start page is negative, or the page count is not positive
     */
    private static void parseArguments(String[] args) {
        if (args == null || args.length == 0) {
            return;
        }
        int start = Integer.parseInt(args[0]);
        int count = pagesCount;
        if (args.length >= 2) {
            count = Integer.parseInt(args[1]);
        }
        if (start < 0 || count <= 0) {
            throw new IllegalArgumentException(
                    "startPageNumber=" + start + ", pagesCount=" + count);
        }
        startPageNumber = start;
        pagesCount = count;
    }

    /**
     * Downloads one result page and extracts the hotels listed on it.
     * Processing of the page stops at the first hotel cell whose structure
     * is not the expected one; hotels extracted before that are returned.
     *
     * @param pageNumber zero-based page number appended to {@link #BASE_URL}
     * @return hotel name mapped to its distance text
     * @throws IOException if the page cannot be fetched
     */
    private static Map<String, String> fetchPage(int pageNumber)
            throws IOException {
        Connection connection = Jsoup.connect(BASE_URL + pageNumber);
        connection.userAgent(HEADER_USER_AGENT);
        // NOTE(review): the explicit Host header is disabled; HEADER_HOST is
        // kept for when it is re-enabled.
        // connection.header("Host", HEADER_HOST);
        Document document = connection.get();

        Elements hotelCells = document.select("div.hotelresult "
                + "div.datacell div.datacelldetailwrapper");
        Map<String, String> result = new HashMap<>();
        for (Element hotelCell : hotelCells) {
            // e.g. "The Mira Hong Kong"
            String hotelName = selectSingleText(hotelCell,
                    "div.namecontainer a.hotelresultsname");
            if (hotelName == null) {
                System.err.println(RESOLVE_FAILED_MESSAGE);
                break;
            }
            // e.g. "1.2 mi from Hong Kong (city centre)"
            String distanceText = selectSingleText(hotelCell,
                    "div.landmarkDistance");
            if (distanceText == null) {
                System.err.println(RESOLVE_FAILED_MESSAGE);
                break;
            }
            result.put(hotelName, extractDistance(distanceText));
        }
        return result;
    }

    /**
     * Returns the text of the single element matching {@code cssQuery}
     * below {@code parent}.
     *
     * @param parent   element to search in
     * @param cssQuery CSS selector to evaluate
     * @return the element text, or {@code null} unless exactly one element
     *         matches
     */
    private static String selectSingleText(Element parent, String cssQuery) {
        Elements matches = parent.select(cssQuery);
        if (matches.size() != 1) {
            return null;
        }
        return matches.get(0).text();
    }

    /**
     * Cuts the distance text at the first non-breaking space and returns
     * the leading part; the text is returned unchanged when it contains
     * no such separator.
     *
     * @param distanceText text of the distance element
     * @return the part before the first separator
     */
    private static String extractDistance(String distanceText) {
        // The literal "&nbsp;" is still accepted for backward compatibility,
        // and the decoded character U+00A0 is matched as well.
        // NOTE(review): Element.text() is expected to return entity-decoded
        // text, and some jsoup versions may additionally normalise U+00A0
        // to a plain space -- confirm against the jsoup version in use.
        return distanceText.split("&nbsp;|\\u00A0", 2)[0];
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment