Skip to content

Instantly share code, notes, and snippets.

@huangdongxu
Created December 23, 2011 15:44
Show Gist options
  • Save huangdongxu/1514504 to your computer and use it in GitHub Desktop.
Save huangdongxu/1514504 to your computer and use it in GitHub Desktop.
A simple CLI tool for HTML parsing
#!/usr/bin/env python
#coding=utf-8
import os
import sys
import urllib
import urllib2
from BeautifulSoup import BeautifulSoup
import logging
import urlparse
# Module-wide logger named 'console': DEBUG and above go to a stream
# handler (StreamHandler() with no argument writes to sys.stderr).
logger = logging.getLogger('console')
logger.addHandler(logging.StreamHandler())
logger.setLevel(logging.DEBUG)

# Input-source selectors for parse(): fetch a URL or read a local file.
TYPE_URL, TYPE_FILE = 0, 1
# Write a line of text to the console (standard output).
def output(str):
    """Print *str* on standard output, encoded as UTF-8.

    The parameter keeps its original name (which shadows the builtin
    inside this function) so that existing callers are unaffected.
    """
    encoded = str.encode('utf-8')
    print(encoded)
def get_fp(filename):
    """Return a readable file object for *filename*.

    The special name '-' selects standard input; any other name is
    opened with open() in its default mode. The handle is not closed
    here.
    """
    return sys.stdin if filename == '-' else open(filename)
# Too simple... sometimes naive: only the http:// or https:// prefix is checked.
def is_valid_url(url):
    """Return True if *url* begins with 'http://' or 'https://'.

    Only the prefix is inspected; the remainder of the string is not
    validated in any way.
    """
    return url.startswith(('http://', 'https://'))
def parse(url, rules, type=TYPE_URL):
    """Print the attribute values selected by *rules* from an HTML page.

    url   -- the page address when type is TYPE_URL; otherwise a file
             name handed to get_fp() ('-' means standard input).
    rules -- iterable of 'tag.attribute' strings, e.g. 'a.href' or
             'img.src'. A rule that does not split into exactly two
             parts on '.' is skipped.
    type  -- TYPE_URL (default) or TYPE_FILE; selects how *url* is read.

    For every <tag> that carries the attribute, the value is passed to
    output(). Values starting with 'javascript:' are dropped. For 'a'
    and 'img' tags the value is resolved against *url* first, so that
    relative links are printed in absolute form.
    """
    # read page content
    if type == TYPE_URL:
        fp = urllib.urlopen(url)
    else:
        fp = get_fp(url)
    try:
        page_content = fp.read()
    finally:
        # Release the socket/file once the page is in memory, but leave
        # standard input open for the rest of the process.
        if fp is not sys.stdin:
            fp.close()

    # parse page content using beautiful soup
    soup = BeautifulSoup(page_content)
    for rule in rules:
        try:
            # example: a.href -> <a href="...">   div.id -> <div id="...">
            node, attr = rule.split('.')
        except ValueError:
            # zero or several dots: not a usable tag.attribute pair
            continue
        for element in soup(node):
            # only elements which carry the requested attribute
            value = element.get(attr)
            if value is None:
                continue
            if value.startswith('javascript:'):
                continue
            # <a href> and <img src> hold links: make them absolute.
            # urljoin handles '/rooted', 'relative', '../up',
            # '//host/path' and already-absolute values uniformly, and
            # leaves non-hierarchical schemes such as mailto: untouched.
            if node in ('a', 'img'):
                value = urlparse.urljoin(url, value)
            output(value)
if __name__ == '__main__':
    # Command line: [-f] <url-or-file> <rule> [rule ...]
    # With -f the first operand is a local file name ('-' for standard
    # input, see get_fp) instead of an http(s) URL.
    args = sys.argv[1:]
    from_file = bool(args) and args[0] == '-f'
    if from_file:
        args = args[1:]

    # A source plus at least one rule is required, with or without -f.
    if len(args) < 2:
        prog = sys.argv[0]
        print('usage: ' + prog + ' <url> <rules1> [rules2] ...')
        print('       ' + prog + ' -f <file> <rules1> [rules2] ...')
        print('example: ' + prog + ' http://g.cn img.src')
        print(prog + ' http://g.cn a.href img.src')
        sys.exit(0)

    url = args[0]
    rules = args[1:]
    if from_file:
        parse(url, rules, TYPE_FILE)
    elif is_valid_url(url):
        parse(url, rules)
    else:
        # Report the problem instead of exiting silently with success.
        sys.stderr.write('url is not valid\n')
        sys.exit(1)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment