Skip to content

Instantly share code, notes, and snippets.

@karlcow
Last active December 10, 2015 16:28
Show Gist options
  • Save karlcow/4460798 to your computer and use it in GitHub Desktop.
Save karlcow/4460798 to your computer and use it in GitHub Desktop.
Prototype demonstrating how to collect links and their rel values, as needed when consuming hypermedia APIs.
#!/usr/bin/env python2.7
# encoding: utf-8
import sys
import string
from pprint import pprint
from urlparse import urljoin
from lxml.html import html5parser
import argparse
# CONFIG
# XHTML namespace URI; getarel() binds it to the 'h' prefix in its XPath query.
HTMLNS = "http://www.w3.org/1999/xhtml"
def getURI(uri):
    """Parse the document located at *uri* and return its root element."""
    tree = html5parser.parse(uri)
    return tree.getroot()
def getarel(doc, uri):
    """Collect the rel/href pairs of every ``a`` element that has a ``rel``.

    Args:
        doc: parsed document root (as returned by getURI); must support
            ``xpath``.
        uri: base URI against which each ``href`` value is resolved.

    Returns:
        A list of dicts of the form ``{'rel': ..., 'href': ...}``, one entry
        per individual rel token, in the order the XPath query yields the
        elements. ``a`` elements that carry ``rel`` but no ``href`` are
        skipped.
    """
    hyperlinks = []
    # grab all 'a' elements that carry a 'rel' attribute
    anchors = doc.xpath('//h:a[@rel]', namespaces={'h': HTMLNS})
    for a in anchors:
        hrefs = a.xpath('@href')
        if not hrefs:
            # an anchor without href has no target to record; skip it
            # rather than fail on the [0] lookup
            continue
        # resolve once per anchor; every rel token shares the same target
        target = urljoin(uri, hrefs[0])
        # a 'rel' attribute may hold a whitespace-separated list of values
        for rel in a.xpath('@rel')[0].split():
            hyperlinks.append({'rel': rel, 'href': target})
    return hyperlinks
def main():
    """Command-line entry point.

    Reads one URI from the command line, parses the document it points to,
    and pretty-prints the list of rel/href pairs found on its ``a`` elements.
    """
    cli = argparse.ArgumentParser(
        description="parse a Web site and search for rel values on a")
    cli.add_argument('uri', nargs=1, action='store',
                     help='URI to be processed')
    # nargs=1 makes argparse store a one-element list; take its only item
    target = cli.parse_args().uri[0]
    pprint(getarel(getURI(target), target))
if __name__ == "__main__":
    # run only when executed as a script; main()'s return value becomes
    # the argument to sys.exit
    exit_status = main()
    sys.exit(exit_status)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment