Skip to content

Instantly share code, notes, and snippets.

@caoxudong
Last active December 25, 2015 18:59
Show Gist options
  • Save caoxudong/7024227 to your computer and use it in GitHub Desktop.
Save caoxudong/7024227 to your computer and use it in GitHub Desktop.
解析网页获取数据
#!/usr/bin/env python
#coding:utf-8
"""
徐敏需要的一个网页解析工具。
解析网页,从中找出"User also viewed"和"Distance from"数据,打印。
示例网页: http://www.kayak.com/hotels/InterContinental-Hong-Kong,Kowloon,Hong-Kong-c55204-h49498-details/2013-11-01/2013-11-02/2guests/#similar
但是,这sb网页写的真屎,真正的数据是在这个页面 http://www.kayak.com/h/run/hoteldetails/tab?tab=similar&hid=49498&searchid=GLEEDa4RQ_&rsrc=hdetails
"""
import sys
import urllib
import urllib2
from HTMLParser import HTMLParser
class MyHTMLParser(HTMLParser):
    """Collect the "User also viewed" and "Distance from" texts of a page.

    After ``feed()``:

    * ``user_also_viewed`` holds the stripped, non-empty text chunks seen
      while inside a ``<div class="similarHotelAlsoViewed cell">``.
    * ``distance_from`` holds the stripped text chunks seen while inside a
      ``<div class="similarHotelDistanceFromText">``, except text that sits
      inside an ``<a class="showOnMap">`` link.

    Capture state is tracked with simple flags, not a tag stack: a flag is
    switched off by the first matching closing tag, so a nested ``<div>``
    ends the capture of its parent early.
    """

    def __init__(self):
        # Explicit base-class call, exactly as the original code did.
        HTMLParser.__init__(self)
        self.user_also_viewed = []
        self.distance_from = []
        self.__is_user_also_viewed = False
        self.__is_distance_from = False
        self.__is_a_with_show_on_map = False

    def handle_starttag(self, tag, attrs):
        """Switch on the capture flag that matches ``tag`` and its class."""
        # Only <a> and <div> can ever match below; returning early also keeps
        # unrelated tags from being inspected at all.
        if tag not in ('a', 'div'):
            return
        for name, value in attrs:
            # Attributes written without a value (e.g. ``<div hidden>``) are
            # reported with value None; calling .strip() on it would raise.
            if value is None:
                continue
            if name.strip() != 'class':
                continue
            value = value.strip()
            if tag == 'div':
                if value == 'similarHotelAlsoViewed cell':
                    self.__is_user_also_viewed = True
                if value == 'similarHotelDistanceFromText':
                    self.__is_distance_from = True
            elif value == 'showOnMap':
                self.__is_a_with_show_on_map = True

    def handle_data(self, data):
        """Store ``data`` in whichever result list is currently capturing."""
        if self.__is_distance_from and not self.__is_a_with_show_on_map:
            # Stored even when empty after stripping (original behaviour).
            self.distance_from.append(data.strip())
        if self.__is_user_also_viewed:
            data = data.strip()
            if data:
                self.user_also_viewed.append(data)

    def handle_endtag(self, tag):
        """Switch off the capture flags that a closing ``tag`` terminates."""
        if tag == 'a' and self.__is_a_with_show_on_map:
            self.__is_a_with_show_on_map = False
        # A closing </div> ends both div-based captures.
        if tag == 'div' and self.__is_distance_from:
            self.__is_distance_from = False
        if tag == 'div' and self.__is_user_also_viewed:
            self.__is_user_also_viewed = False
# Script entry: fetch the "similar hotels" tab for the hotel id given as the
# first command-line argument, parse it, and print the collected data as two
# tab-separated rows ("distance from" first, "user also viewed" second).
if len(sys.argv) < 2:
    # Usage message (Chinese): "please supply an argument: the hotel id".
    print(u'请输入参数: 参数为酒店id\n')
    sys.exit(1)

hotel_id = sys.argv[1]
base_target_url = 'http://www.kayak.com/h/run/hoteldetails/tab?tab=similar&hid=%s&searchid=GLEEDa4RQ_&rsrc=hdetails'
# Percent-encode the id so characters such as '&' or '#' in the argument
# cannot alter the rest of the query string.
target_url = base_target_url % urllib.quote(hotel_id, safe='')

try:
    response = urllib2.urlopen(target_url)
except urllib2.URLError as error:
    # Covers HTTP errors too (HTTPError is a URLError subclass).
    sys.stderr.write('failed to fetch %s: %s\n' % (target_url, error))
    sys.exit(1)
try:
    content = response.read()
finally:
    # Always release the connection, even if reading fails.
    response.close()

parser = MyHTMLParser()
parser.feed(content)

count_of_distance_from_data = len(parser.distance_from)
count_of_user_also_viewed_data = len(parser.user_also_viewed)
if count_of_distance_from_data != count_of_user_also_viewed_data:
    # The two rows must have the same length; report on stderr and fail so
    # the problem is not mistaken for valid output.
    sys.stderr.write("wrong data, please contact caoxudong.\n")
    sys.exit(1)
else:
    # Each cell is followed by a space and a tab, each row by a newline,
    # which matches the output format of the previous print statements.
    for row in (parser.distance_from, parser.user_also_viewed):
        print(''.join(cell + ' \t' for cell in row))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment