Created
September 23, 2013 06:22
-
-
Save mdaniel/6666997 to your computer and use it in GitHub Desktop.
Outputs a textual description of the provided BBC programme URL (or local file).
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#! /usr/bin/env python | |
import sys | |
from urllib import urlopen | |
from bs4 import BeautifulSoup | |
def build_xmlns_alias_map(html_tag):
    """
    Invert the `xmlns:<alias>` declarations carried by a tag.

    Attributes whose name does not start with `xmlns:` are ignored.

    :param html_tag: the `html` Tag whose `attrs` hold the `xmlns` attributes
    :return: a dict keyed by namespace URI, valued by the alias bound to it in this document
    """
    prefix = 'xmlns:'
    return {
        namespace: attr_name[len(prefix):]
        for attr_name, namespace in html_tag.attrs.items()
        if attr_name.startswith(prefix)
    }
def _tag_text(tag):
    """
    :param tag: a Tag returned by a `find` call, or None when nothing matched
    :return: `tag.string`, or None when there is no tag
    """
    if tag is None:
        return None
    return tag.string


def main(argv):
    """
    Scans the provided HTML document and enumerates the MusicSegment structures found therein.

    For every music segment inside the element whose id is `segments`, prints the artist name,
    the title and (when the segment has one) the record label, followed by an empty line.
    Nothing is printed when the document has no `segments` element.

    :param argv: the command-line argument list; `argv[1]` is the location passed to `urlopen`
    :raises IndexError: when `argv` has no element at index 1
    :raises KeyError: when the document does not declare one of the dc/foaf/mo/po namespaces
    """
    from contextlib import closing

    XMLNS_DC = 'http://purl.org/dc/terms/'
    XMLNS_FOAF = 'http://xmlns.com/foaf/0.1/'
    XMLNS_MO = 'http://purl.org/ontology/mo/'
    XMLNS_PO = 'http://purl.org/ontology/po/'

    # closing() releases the handle even when read() raises part-way through
    with closing(urlopen(argv[1])) as fh:
        soup = BeautifulSoup(fh.read())
    html = soup.find('html', recursive=False)

    # the document chooses its own namespace aliases, so resolve them before building selectors
    ns_alias_map = build_xmlns_alias_map(html)
    dc_ns = ns_alias_map[XMLNS_DC]
    mo_ns = ns_alias_map[XMLNS_MO]
    po_ns = ns_alias_map[XMLNS_PO]
    foaf_ns = ns_alias_map[XMLNS_FOAF]

    DcTitleNS = '%s:title' % dc_ns
    MusicSegmentNS = '%s:MusicSegment' % po_ns
    MusicArtistNS = '%s:MusicArtist' % mo_ns
    FoafNameNS = '%s:name' % foaf_ns

    segments = html.find(attrs={'id': 'segments'})
    if segments is None:
        # no segment listing in this document, so there is nothing to enumerate
        return

    for seg in segments.find_all(attrs={'typeof': MusicSegmentNS}):
        art = seg.find(attrs={'typeof': MusicArtistNS})
        art_name = None
        if art is not None:
            art_name = art.find(attrs={'property': FoafNameNS})
        if art_name is None:
            # no dedicated name element: fall back to the artist container's own text
            art_name = art
        tit = seg.find(attrs={'property': DcTitleNS})

        release_label = None
        release = seg.find(attrs={'class': 'release'})
        if release is not None:
            release_label = release.find(attrs={'class': 'record-label'})

        print('artist = %s' % _tag_text(art_name))
        print('title = %s' % _tag_text(tit))
        if release_label is not None:
            print('label = %s' % release_label.string)
        print('')
if __name__ == "__main__":
    # executed as a script: hand the raw command line to main()
    main(sys.argv)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment