- 
      
 - 
        
Save andreasvc/b3b4189120d84dec8857 to your computer and use it in GitHub Desktop.  
| """Extract metadata from Project Gutenberg RDF catalog into a Python dict. | |
| Based on https://bitbucket.org/c-w/gutenberg/ | |
| >>> md = readmetadata() | |
| >>> md[123] | |
| {'LCC': {'PS'}, | |
| 'author': u'Burroughs, Edgar Rice', | |
| 'authoryearofbirth': 1875, | |
| 'authoryearofdeath': 1950, | |
| 'downloads': 401, | |
| 'formats': {'application/epub+zip': 'http://www.gutenberg.org/ebooks/123.epub.noimages', | |
| 'application/prs.plucker': 'http://www.gutenberg.org/ebooks/123.plucker', | |
| 'application/x-mobipocket-ebook': 'http://www.gutenberg.org/ebooks/123.kindle.noimages', | |
| 'application/x-qioo-ebook': 'http://www.gutenberg.org/ebooks/123.qioo', | |
| 'text/html; charset=iso-8859-1': 'http://www.gutenberg.org/files/123/123-h.zip', | |
| 'text/plain': 'http://www.gutenberg.org/ebooks/123.txt.utf-8', | |
| 'text/plain; charset=us-ascii': 'http://www.gutenberg.org/files/123/123.zip'}, | |
| 'id': 123, | |
| 'language': ['en'], | |
| 'subjects': {'Adventure stories', | |
| 'Earth (Planet) -- Core -- Fiction', | |
| 'Fantasy fiction', | |
| 'Science fiction'}, | |
| 'title': u"At the Earth's Core", | |
| 'type': 'Text'} | |
| """ | |
| import os | |
| import re | |
| import gzip | |
| import tarfile | |
| import urllib | |
| import xml.etree.cElementTree as ElementTree | |
| try: | |
| import cPickle as pickle | |
| except ImportError: | |
| import pickle | |
# Cache and catalog locations.
PICKLEFILE = '/tmp/md.pickle.gz'  # The Python dict produced by this module
RDFFILES = '/tmp/rdf-files.tar.bz2'  # The catalog downloaded from Gutenberg
RDFURL = r'http://www.gutenberg.org/cache/epub/feeds/rdf-files.tar.bz2'
# Metadata fields extracted for each ebook record.
META_FIELDS = ('id', 'author', 'title', 'downloads', 'formats', 'type', 'LCC',
		'subjects', 'authoryearofbirth', 'authoryearofdeath', 'language')
# XML namespaces used by the Gutenberg RDF catalog.
NS = dict(
		pg='http://www.gutenberg.org/2009/pgterms/',
		dc='http://purl.org/dc/terms/',
		dcam='http://purl.org/dc/dcam/',
		rdf='http://www.w3.org/1999/02/22-rdf-syntax-ns#')
# Whitespace-padded line breaks separating a title from its subtitles.
# NOTE: plain r'' instead of ur'' -- the ur prefix is a SyntaxError on
# Python 3, and this ASCII-only pattern matches unicode strings identically.
LINEBREAKRE = re.compile(r'[ \t]*[\n\r]+[ \t]*')
# An etext id as it appears in the text, e.g. '[EBook #11609]'
# (id after '#') or '[Etext 1576#]' (id before '#').
ETEXTRE = re.compile(r'''
	e(text|b?ook)
	\s*
	(\#\s*(?P<etextid_front>\d+)
	|
	(?P<etextid_back>\d+)\s*\#)
	''', re.IGNORECASE | re.VERBOSE)
def readmetadata():
	"""Read/create cached metadata dump of Gutenberg catalog.

	Returns:
		A dictionary mapping etext numbers to dictionaries with the
		following fields:
			id (int): Gutenberg identifier of text
			author (str): Last name, First name
			title (str): title of work
			subjects (list of str): list of descriptive subjects; a subject
				may be hierarchical, e.g:
				'England -- Social life and customs -- 19th century -- Fiction'
			LCC (list of str): a list of two letter Library of Congress
				Classifications, e.g., 'PS'
			language (list of str): list of two letter language codes.
			type (str): 'Text', 'Sound', ...
			formats (dict of str, str pairs): keys are MIME types, values
				are URLs.
			download count (int): the number of times this ebook has been
				downloaded from the Gutenberg site in the last 30 days.
		Fields that are not part of the metadata are set to None.
	http://www.gutenberg.org/wiki/Gutenberg:Help_on_Bibliographic_Record_Page
	"""
	if os.path.exists(PICKLEFILE):
		# Use 'with' so the gzip handle is closed promptly instead of
		# leaking until garbage collection.
		with gzip.open(PICKLEFILE, 'rb') as cache:
			metadata = pickle.load(cache)
	else:
		metadata = {}
		for xml in getrdfdata():
			ebook = xml.find(r'{%(pg)s}ebook' % NS)
			if ebook is None:
				continue
			result = parsemetadata(ebook)
			if result is not None:
				metadata[result['id']] = result
		# protocol=-1: highest available pickle protocol (most compact).
		with gzip.open(PICKLEFILE, 'wb') as cache:
			pickle.dump(metadata, cache, protocol=-1)
	return metadata
def getrdfdata():
	"""Downloads Project Gutenberg RDF catalog.

	Yields:
		xml.etree.ElementTree.ElementTree: an etext metadata document,
		one per file in the downloaded archive.
	"""
	if not os.path.exists(RDFFILES):
		# urlretrieve moved to urllib.request in Python 3; import locally
		# so the module keeps working on both interpreter versions.
		try:
			from urllib import urlretrieve  # Python 2
		except ImportError:
			from urllib.request import urlretrieve  # Python 3
		urlretrieve(RDFURL, RDFFILES)
	# tarfile.open autodetects the bz2 compression from the file.
	with tarfile.open(RDFFILES) as archive:
		for tarinfo in archive:
			yield ElementTree.parse(archive.extractfile(tarinfo))
def parsemetadata(ebook):
	"""Parses an etext meta-data definition to extract fields.

	Args:
		ebook (xml.etree.ElementTree.Element): An ebook meta-data definition.

	Returns:
		dict: one entry per name in META_FIELDS; fields absent from the
		metadata remain None.
	"""
	result = dict.fromkeys(META_FIELDS)
	# get etext no: the rdf:about attribute ends in the numeric id
	about = ebook.get('{%(rdf)s}about' % NS)
	result['id'] = int(os.path.basename(about))
	# author
	creator = ebook.find('.//{%(dc)s}creator' % NS)
	if creator is not None:
		name = creator.find('.//{%(pg)s}name' % NS)
		if name is not None:
			result['author'] = safeunicode(name.text, encoding='utf-8')
		birth = creator.find('.//{%(pg)s}birthdate' % NS)
		if birth is not None:
			result['authoryearofbirth'] = int(birth.text)
		death = creator.find('.//{%(pg)s}deathdate' % NS)
		if death is not None:
			result['authoryearofdeath'] = int(death.text)
	# title
	title = ebook.find('.//{%(dc)s}title' % NS)
	if title is not None:
		result['title'] = fixsubtitles(
				safeunicode(title.text, encoding='utf-8'))
	# subject lists: LCSH subjects and LCC classifications are both
	# dc:subject elements, distinguished by their dcam:memberOf resource.
	result['subjects'], result['LCC'] = set(), set()
	for subject in ebook.findall('.//{%(dc)s}subject' % NS):
		res = subject.find('.//{%(dcam)s}memberOf' % NS)
		if res is None:
			continue
		res = res.get('{%(rdf)s}resource' % NS)
		value = subject.find('.//{%(rdf)s}value' % NS)
		if value is None:
			continue  # malformed record: skip rather than crash
	
		if res == ('%(dc)sLCSH' % NS):
			result['subjects'].add(value.text)
		elif res == ('%(dc)sLCC' % NS):
			result['LCC'].add(value.text)
	# formats: map MIME type -> URL.  Built with an explicit loop so that
	# file entries lacking a format value are skipped instead of raising
	# AttributeError; loop variable renamed to avoid shadowing builtin 'file'.
	formats = {}
	for fileelem in ebook.findall('.//{%(pg)s}file' % NS):
		mimetype = fileelem.find('{%(dc)s}format//{%(rdf)s}value' % NS)
		if mimetype is not None:
			formats[mimetype.text] = fileelem.get('{%(rdf)s}about' % NS)
	result['formats'] = formats
	# type
	booktype = ebook.find('.//{%(dc)s}type//{%(rdf)s}value' % NS)
	if booktype is not None:
		result['type'] = booktype.text
	# languages
	lang = ebook.findall('.//{%(dc)s}language//{%(rdf)s}value' % NS)
	result['language'] = [a.text for a in lang] or None
	# download count
	downloads = ebook.find('.//{%(pg)s}downloads' % NS)
	if downloads is not None:
		result['downloads'] = int(downloads.text)
	return result
def etextno(lines):
	"""Retrieves the id for an etext.

	Args:
		lines (iter): The lines of the etext to search.

	Returns:
		int: The id of the etext.

	Raises:
		ValueError: If no etext id was found.

	Examples:
		>>> etextno(['Release Date: March 17, 2004 [EBook #11609]'])
		11609
		>>> etextno(['Release Date: July, 2003 [Etext# 4263]'])
		4263
		>>> etextno(['Release Date: November 29, 2003 [Eook #10335]'])
		10335
		>>> etextno(['December, 1998 [Etext 1576#]'])
		1576
		>>> etextno(['Some lines', 'without', 'Any [Etext] Number'])
		Traceback (most recent call last):
			...
		ValueError: no etext-id found
	"""
	for line in lines:
		match = ETEXTRE.search(line)
		if match is not None:
			front_match = match.group('etextid_front')
			back_match = match.group('etextid_back')
			if front_match is not None:
				return int(front_match)
			elif back_match is not None:
				return int(back_match)
			else:
				# ETEXTRE guarantees exactly one of the two groups matched,
				# so this branch should be unreachable.
				raise ValueError('no regex match (this should never happen)')
	raise ValueError('no etext-id found')
def fixsubtitles(title):
	"""Introduce any subtitle with (semi)colons instead of newlines.

	The first subtitle is introduced with a colon, the rest with semicolons.

	>>> fixsubtitles(u'First Across ...\r\nThe Story of ... \r\n'
	... 'Being an investigation into ...')
	u'First Across ...: The Story of ...; Being an investigation into ...'"""
	# Replace only the first line break with ': ' ...
	withcolon = LINEBREAKRE.sub(': ', title, 1)
	# ... then every remaining one with '; '.
	return LINEBREAKRE.sub('; ', withcolon)
def safeunicode(arg, *args, **kwargs):
	"""Coerce argument to the text type, if it's not already.

	Extra positional/keyword arguments (e.g. ``encoding='utf-8'``) are
	forwarded to the text constructor when a conversion is performed.
	"""
	# 'unicode' only exists on Python 2; on Python 3 the text type is str.
	try:
		text_type = unicode
	except NameError:
		text_type = str
	return arg if isinstance(arg, text_type) else text_type(arg, *args, **kwargs)
| __all__ = ['readmetadata'] | 
Brilliant! :) 👍  Just thought I should say: if you are using Python 3, `urllib.urlretrieve` doesn't work anymore — you have to import `urllib.request` and use `urllib.request.urlretrieve` instead, and `unicode` has to be `str`.
Dear Andreas, I do greatly appreciate your taking the time to make this code available. I did struggle with the structure though - your design is a dictionary of dictionaries, which I believe makes it unnecessarily complex to extract its data. Put another way, I can't think of a circumstance where a record's position in the RDF catalog is significant for our purposes. Am I incorrect?
Based on the above assumption I made two minor changes to your code for my own use so that your 'metadata' structure becomes a list of dictionaries rather than a dictionary of dictionaries:
    if os.path.exists(PICKLEFILE):
        metadata = pickle.load(gzip.open(PICKLEFILE, 'rb'))
    else:
        # metadata = {}
        metadata = []
        for xml in getrdfdata():
            ebook = xml.find(r'{%(pg)s}ebook' % NS)
            if ebook is None:
                continue
            result = parsemetadata(ebook)
            if result is not None:
                # metadata[result['id']] = result
                metadata.append(result)
        pickle.dump(metadata, gzip.open(PICKLEFILE, 'wb'), protocol=-1)
    return metadata
With this change, accessing the structure is simpler, e.g.:
for item in metaData:
    print("item is:", item)
Rather than:
for key in list(metaData.keys()):
    print("item is:", metaData[key])
Possibly it's too late to introduce this change if it will break dependent code, though maybe it could be added to a future fork.
Sorry, I can't quite work out what `def safeunicode(arg, *args, **kwargs)` does. I'm having trouble with Unicode — some texts are not displayed properly. Would this function help solve that issue? Anyway, thank you for the snippet!
Thank you. :-)