Last active
December 25, 2015 18:59
-
-
Save caoxudong/7024227 to your computer and use it in GitHub Desktop.
解析网页获取数据
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
#coding:utf-8 | |
""" | |
徐敏需要的一个网页解析工具。 | |
解析网页,从中找出"User also viewed"和"Distance from"数据,打印。 | |
示例网页: http://www.kayak.com/hotels/InterContinental-Hong-Kong,Kowloon,Hong-Kong-c55204-h49498-details/2013-11-01/2013-11-02/2guests/#similar | |
但是,这sb网页写的真屎,真正的数据是在这个页面 http://www.kayak.com/h/run/hoteldetails/tab?tab=similar&hid=49498&searchid=GLEEDa4RQ_&rsrc=hdetails | |
""" | |
import sys | |
import urllib2 | |
from HTMLParser import HTMLParser | |
class MyHTMLParser(HTMLParser):
    """Collect "User also viewed" and "Distance from" texts from fed HTML.

    After calling ``feed()``, results are available in two public lists:

    * ``user_also_viewed`` -- stripped, non-empty text chunks seen inside a
      ``<div class="similarHotelAlsoViewed cell">`` element.
    * ``distance_from`` -- stripped text chunks (empty ones are kept) seen
      inside a ``<div class="similarHotelDistanceFromText">`` element,
      skipping text that belongs to an ``<a class="showOnMap">`` link.

    The ``class`` attribute is compared as an exact (whitespace-stripped)
    string, and any closing ``</div>`` ends the current capture.
    """

    def __init__(self):
        HTMLParser.__init__(self)
        self.user_also_viewed = []
        self.distance_from = []
        # State flags: which kind of element the parser is currently inside.
        self.__is_user_also_viewed = False
        self.__is_distance_from = False
        self.__is_a_with_show_on_map = False

    def handle_starttag(self, tag, attrs):
        """Switch capture flags on when a tag of interest is opened.

        ``attrs`` is the list of ``(name, value)`` pairs supplied by
        HTMLParser; ``value`` is ``None`` for attributes written without a
        value (e.g. ``<div hidden>``), so those are skipped instead of
        being stripped.
        """
        # Only <div> and <a> tags can change the state below.
        if tag not in ('div', 'a'):
            return
        for name, value in attrs:
            if value is None or name.strip() != 'class':
                continue
            value = value.strip()
            if tag == 'div':
                if value == 'similarHotelAlsoViewed cell':
                    self.__is_user_also_viewed = True
                if value == 'similarHotelDistanceFromText':
                    self.__is_distance_from = True
            elif value == 'showOnMap':
                self.__is_a_with_show_on_map = True

    def handle_data(self, data):
        """Record text that appears while a capture flag is set."""
        # Inside the distance div, text of the "show on map" link is ignored.
        if self.__is_distance_from and not self.__is_a_with_show_on_map:
            self.distance_from.append(data.strip())
        if self.__is_user_also_viewed:
            data = data.strip()
            if len(data) != 0:
                self.user_also_viewed.append(data)

    def handle_endtag(self, tag):
        """Switch capture flags off when the matching kind of tag closes."""
        if tag == 'a' and self.__is_a_with_show_on_map:
            self.__is_a_with_show_on_map = False
        if tag == 'div' and self.__is_distance_from:
            self.__is_distance_from = False
        if tag == 'div' and self.__is_user_also_viewed:
            self.__is_user_also_viewed = False
if len(sys.argv) < 2: | |
print u'请输入参数: 参数为酒店id\n' | |
sys.exit(1) | |
hotel_id = sys.argv[1] | |
base_target_url = 'http://www.kayak.com/h/run/hoteldetails/tab?tab=similar&hid=%s&searchid=GLEEDa4RQ_&rsrc=hdetails' | |
content = urllib2.urlopen(base_target_url % hotel_id).read() | |
parser = MyHTMLParser() | |
parser.feed(content) | |
count_of_distance_from_data = len(parser.distance_from) | |
count_of_user_also_viewed_data = len(parser.user_also_viewed) | |
if count_of_distance_from_data != count_of_user_also_viewed_data: | |
print "wrong data, please contact caoxudong." | |
else: | |
for distance_from in parser.distance_from: | |
print distance_from, '\t', | |
print '' | |
for user_also_viewed in parser.user_also_viewed: | |
print user_also_viewed, '\t', | |
print '' | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment