Created
March 13, 2014 14:22
-
-
Save caoxudong/9529439 to your computer and use it in GitHub Desktop.
解析kayak网站上酒店与市中心的距离
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package xumin; | |
import java.io.IOException; | |
import java.util.HashMap; | |
import java.util.Map; | |
import org.jsoup.Connection; | |
import org.jsoup.Jsoup; | |
import org.jsoup.nodes.Document; | |
import org.jsoup.nodes.Element; | |
import org.jsoup.select.Elements; | |
/** | |
* | |
* 解析目标网页,获取酒店距离市中心的距离 | |
* | |
* 目标网页: http://www.kayak.com.hk/hotels/Hong-Kong,Hong-Kong-c23190/2014-04-01/ | |
* 2014-04-02/2guests?pn={pageNumber} | |
* | |
* 该程序有两个可选参数,分别是startPageNumber和pagesCount,如果设置了多个参数,只读取前两个: | |
* <ol> | |
* <li>startPageNumber: 从第几页开始获取,默认为0</li> | |
* <li>pagesCount: 总共要获取几页数据,默认为1</li> | |
* </ol> | |
* | |
* 为简便起见,这里并没有使用多线程来处理对多个页面的解析,后续可以修改为多线程处理,或者使用多进程的方式处理。 | |
* | |
* NOTE: 页码数从0开始 | |
* | |
* @author caoxudong | |
* | |
*/ | |
public class FetchDistanceFromHongKong { | |
private static final String HEADER_USER_AGENT = "Mozilla/5.0 (Windows NT " | |
+ "6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) " | |
+ "Chrome/33.0.1750.146 Safari/537.36"; | |
private static final String HEADER_HOST = "www.kayak.com.hk"; | |
public static String BASE_URL = "http://www.kayak.com.hk/hotels/" | |
+ "Hong-Kong,Hong-Kong-c23190/2014-04-01/2014-04-02/2guests?pn="; | |
public static int startPageNumber = 0; | |
public static int pagesCount = 1; | |
public static void main(String[] args) throws IOException { | |
//process arguments | |
if (args != null) { | |
try { | |
if (args.length == 1) { | |
startPageNumber = Integer.parseInt(args[0]); | |
} else if (args.length == 2){ | |
startPageNumber = Integer.parseInt(args[0]); | |
pagesCount = Integer.parseInt(args[1]); | |
if (pagesCount <= 0) { | |
} | |
} | |
} catch (Exception e) { | |
System.err.println("Invalid arguments. " | |
+ "They should be numbers, and greater than 0."); | |
System.exit(1); | |
} | |
} | |
Map<String, String> result = new HashMap<>(); | |
int loopIndex=0; | |
while (loopIndex < pagesCount) { | |
Connection connection = Jsoup.connect(BASE_URL + (startPageNumber | |
+ loopIndex)); | |
connection.userAgent(HEADER_USER_AGENT); | |
// connection.header("Host", HEADER_HOST); | |
Document document = connection.get(); | |
Elements hotelCellElements = document.select("div.hotelresult " | |
+ "div.datacell div.datacelldetailwrapper"); | |
if (hotelCellElements != null) { | |
for (int i=0; i<hotelCellElements.size(); i++) { | |
Element hotelCellElement = hotelCellElements.get(i); | |
String hotelName = null; | |
String distance = null; | |
//resolve hotel name | |
Elements hotelNameElements = hotelCellElement.select( | |
"div.namecontainer a.hotelresultsname"); | |
if ((hotelNameElements == null) | |
|| (hotelNameElements.size() != 1)) { | |
System.err.println("resolve failed, " | |
+ "maybe base page changed. please contact " | |
+ "caoxudong"); | |
break; | |
} else { | |
Element hotelNameElement = hotelNameElements.get(0); | |
//e.g. "The Mira Hong Kong" | |
hotelName = hotelNameElement.text(); | |
} | |
//resolve distance | |
Elements distanceElements = hotelCellElement.select( | |
"div.landmarkDistance"); | |
if ((distanceElements == null) | |
|| (distanceElements.size() != 1)) { | |
System.err.println("resolve failed, " | |
+ "maybe base page changed. please contact " | |
+ "caoxudong"); | |
break; | |
} else { | |
Element distanceElement = distanceElements.get(0); | |
//e.g. "1.2 mi from Hong Kong (city centre)" | |
distance = distanceElement.text(); | |
System.out.println(distance); | |
System.out.println(new String(distance.getBytes(), "UTF-8")); | |
if (distance != null) { | |
distance = distance.split(" ", 2)[0]; | |
} | |
} | |
//set result | |
if ((hotelName != null) && (distance != null)) { | |
result.put(hotelName, distance); | |
} | |
} | |
} | |
//print result | |
System.out.println("Page Number: " + (startPageNumber + loopIndex)); | |
for (Map.Entry<String, String> entry: result.entrySet()) { | |
System.out.println(entry.getKey() + "\t" + entry.getValue()); | |
} | |
System.out.println(); | |
loopIndex++; | |
break; | |
} | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment