A simple CLI tool for HTML parsing: for each rule of the form node.attr (e.g. img.src, a.href) it prints the matching attribute values, one per line.
#!/usr/bin/env python
#coding=utf-8
import sys
import urllib
from BeautifulSoup import BeautifulSoup
import logging
import urlparse

# console logger
logger = logging.getLogger('console')
logger.setLevel(logging.DEBUG)
logger.addHandler(logging.StreamHandler())
TYPE_URL = 0
TYPE_FILE = 1

# print a value to the console as UTF-8
def output(value):
    print value.encode('utf-8')

# open a local file for reading, or stdin when the filename is '-'
def get_fp(filename):
    if filename == '-':
        return sys.stdin
    return open(filename)

# too simple... sometimes naive
def is_valid_url(url):
    return url.startswith('http://') or url.startswith('https://')
def parse(url, rules, source_type=TYPE_URL):
    # read the page content from the network or from a local file
    if source_type == TYPE_URL:
        fp = urllib.urlopen(url)
    else:
        fp = get_fp(url)
    page_content = fp.read()
    # parse the page content using BeautifulSoup
    soup = BeautifulSoup(page_content)
    for rule in rules:
        try:
            # example: a.href -> <a href="...">, div.id -> <div id="...">
            node, attr = rule.split('.')
        except ValueError:
            # skip rules that are not of the form node.attr
            continue
        for element in soup(node):
            # only consider elements which have the specified attribute
            if element is not None and attr in [k for k, _ in element.attrs]:
                value = element[attr]
                if value.startswith('javascript:'):
                    continue
                # resolve relative URLs in <a href="..."> and <img src="..." />
                if node == 'a' or node == 'img':
                    if value.startswith('/'):
                        # absolute path: join with the scheme and host
                        url_component = urlparse.urlparse(url)
                        value = url_component.scheme + '://' + url_component.netloc + value
                        output(value)
                    elif not is_valid_url(value):
                        # relative path: join with the base document's directory
                        url_component = urlparse.urlparse(url)
                        path = '/'.join(url_component.path.split('/')[:-1])
                        value = url_component.scheme + '://' + url_component.netloc + path + '/' + value
                        output(value)
                    else:
                        output(value)
                else:
                    output(value)
if __name__ == '__main__':
    if len(sys.argv) < 3:
        print 'usage: ' + sys.argv[0] + ' [-f] <url|file> <rule1> [rule2] ...'
        print 'example: ' + sys.argv[0] + ' http://g.cn img.src'
        print '         ' + sys.argv[0] + ' http://g.cn a.href img.src'
        exit(0)
    if sys.argv[1] == '-f':
        # -f: treat the argument as a local file (or '-' for stdin)
        is_url = False
        url = sys.argv[2]
        rules = sys.argv[3:]
    else:
        is_url = True
        url = sys.argv[1]
        rules = sys.argv[2:]
    if is_url and is_valid_url(url):
        parse(url, rules)
    elif not is_url:
        parse(url, rules, TYPE_FILE)
    else:
        print 'url is not valid'
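The parse() function can also be driven from another Python 2 script. A minimal sketch, assuming this gist is saved as htmlparser.py (a hypothetical filename) on the import path:

# minimal usage sketch -- 'htmlparser' is a hypothetical module name for this gist's file
from htmlparser import parse, TYPE_FILE

# fetch a page over HTTP and print every <img src="..."> it contains
parse('http://g.cn', ['img.src'])

# read a saved HTML file instead, extracting both link targets and image sources
parse('page.html', ['a.href', 'img.src'], TYPE_FILE)

Passing '-' as the filename together with TYPE_FILE makes get_fp() read the HTML from stdin.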
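As a design note, the hand-rolled link resolution in parse() covers absolute paths and same-directory relative paths, but does not normalize forms like '../page.html'. The Python 2 standard library already implements full resolution in urlparse.urljoin; a sketch of the equivalent behavior:

import urlparse

base = 'http://example.com/articles/index.html'
# absolute path: resolved against the host root
print urlparse.urljoin(base, '/img/logo.png')  # http://example.com/img/logo.png
# relative path: resolved against the base document's directory
print urlparse.urljoin(base, 'photo.jpg')      # http://example.com/articles/photo.jpg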