karlcow · December 10, 2015 16:28
diff --git a/gethyperel.py b/gethyperel.py
 #!/usr/bin/env python2.7
 # encoding: utf-8

 import sys
 import string
 from pprint import pprint
 from urlparse import urljoin
 from lxml.html import html5parser
 import argparse

 # CONFIG
 # XHTML namespace URI. getarel() binds it to the 'h' prefix so that its
 # XPath query can address the namespaced 'a' elements of the parsed document.
 HTMLNS = "http://www.w3.org/1999/xhtml"


 def getURI(uri):
    """Parse ``uri`` with lxml's html5parser and return the tree's root."""
    tree = html5parser.parse(uri)
    return tree.getroot()


 def getarel(doc, uri):
    """Collect the link relations declared on the 'a' elements of a document.

    doc -- parsed document root (as returned by getURI); it must offer
           xpath() and its 'a' elements must offer get().
    uri -- base URI against which every 'href' value is resolved.

    Returns a list of dicts, one per (anchor, rel token) pair, each with the
    keys 'rel' (a single relation token) and 'href' (the resolved target).
    Anchors that have a 'rel' but no 'href' are skipped: they point nowhere,
    and indexing the empty '@href' result used to raise IndexError.
    """
    relations = []
    # grab all 'a' elements of the document that carry a 'rel' attribute
    anchors = doc.xpath('//h:a[@rel]', namespaces={'h': HTMLNS})
    for anchor in anchors:
        href = anchor.get('href')
        if href is None:
            continue
        # resolve once per anchor; the target is the same for every token
        target = urljoin(uri, href)
        # 'rel' may hold a whitespace separated list of values; str.split()
        # replaces the deprecated string.split() and behaves identically
        for rel in anchor.get('rel').split():
            relations.append({'rel': rel, 'href': target})
    return relations


 def main():
    """Read the URI from the command line and pretty-print its rel values."""
    cli = argparse.ArgumentParser(
        description="parse a Web site and search for rel values on a")
    cli.add_argument('uri', nargs=1, action='store',
                     help='URI to be processed')
    options = cli.parse_args()

    # nargs=1 makes argparse deliver a one-element list
    (uri,) = options.uri
    pprint(getarel(getURI(uri), uri))

 if __name__ == "__main__":
    # run only when executed as a script; main()'s result is the exit status
    status = main()
    sys.exit(status)
	#!/usr/bin/env python2.7
	# encoding: utf-8

	import sys
	import string
	from pprint import pprint
	from urlparse import urljoin
	from lxml.html import html5parser
	import argparse

	# CONFIG
	# XHTML namespace URI. getarel() binds it to the 'h' prefix so that its
	# XPath query can address the namespaced 'a' elements of the parsed document.
	HTMLNS = "http://www.w3.org/1999/xhtml"


	def getURI(uri):
	    """Parse ``uri`` with lxml's html5parser and return the tree's root."""
	    # body re-indented: the flattened paste left it outside the def
	    doc = html5parser.parse(uri).getroot()
	    return doc


	def getarel(doc, uri):
	    """Collect the link relations declared on the 'a' elements of a document.

	    doc -- parsed document root (as returned by getURI); it must offer
	           xpath() and its 'a' elements must offer get().
	    uri -- base URI against which every 'href' value is resolved.

	    Returns a list of dicts, one per (anchor, rel token) pair, each with the
	    keys 'rel' (a single relation token) and 'href' (the resolved target).
	    Anchors that have a 'rel' but no 'href' are skipped: they point nowhere,
	    and indexing the empty '@href' result used to raise IndexError.
	    """
	    relations = []
	    # grab all 'a' elements of the document that carry a 'rel' attribute
	    anchors = doc.xpath('//h:a[@rel]', namespaces={'h': HTMLNS})
	    for anchor in anchors:
	        href = anchor.get('href')
	        if href is None:
	            continue
	        # resolve once per anchor; the target is the same for every token
	        target = urljoin(uri, href)
	        # 'rel' may hold a whitespace separated list of values; str.split()
	        # replaces the deprecated string.split() and behaves identically
	        for rel in anchor.get('rel').split():
	            relations.append({'rel': rel, 'href': target})
	    return relations


	def main():
	    """Read the URI from the command line and pretty-print its rel values."""
	    # body re-indented: the flattened paste left it outside the def
	    parser = argparse.ArgumentParser(description="parse a Web site and search for rel values on a")
	    parser.add_argument('uri', help='URI to be processed', action='store', nargs=1)
	    args = parser.parse_args()

	    # nargs=1 makes argparse deliver a one-element list
	    uri = args.uri[0]
	    document = getURI(uri)
	    relations = getarel(document, uri)
	    pprint(relations)

	if __name__ == "__main__":
	    # run only when executed as a script; main()'s result is the exit status
	    sys.exit(main())