Created
October 28, 2011 20:57
-
-
Save lawlesst/1323535 to your computer and use it in GitHub Desktop.
OAI-PMH harvesting with Python
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""
Harvest MARC records via OAI-PMH.
"""
# Mostly from - http://code.google.com/p/oldmapsonline/source/browse/trunk/oai-pmh/oaipmh-client-pyoai-pymarc.py
# MarcXML reader - parsing done by pymarc
# Handle utf-8 strings: make UTF-8 the interpreter's default encoding.
import codecs
import sys

# NOTE(review): Python 2 idiom -- reload(sys) is presumably needed so that
# sys.setdefaultencoding is available to call; confirm before porting.
reload(sys)
sys.setdefaultencoding('utf-8')

from cStringIO import StringIO

from lxml import etree
from lxml.etree import tostring
from oaipmh.client import Client
from oaipmh.metadata import MetadataRegistry
from pymarc import marcxml, MARCWriter, field
class MARCXMLReader(object):
    """Metadata reader: OAI structure for MARC XML in, PyMARC record out."""

    def __call__(self, element):
        """Parse the first child of *element* as MARCXML.

        The child is serialised back to UTF-8 XML, fed to pymarc's XML
        handler, and the first record the handler collected is returned.
        """
        xml_handler = marcxml.XmlHandler()
        marc_xml = tostring(element[0], encoding='UTF-8')
        marcxml.parse_xml(StringIO(marc_xml), xml_handler)
        parsed_records = xml_handler.records
        return parsed_records[0]
# Build the metadata registry and register the MARCXML reader under the
# 'marc21' metadata prefix.
from oaipmh import metadata

marcxml_reader = MARCXMLReader()
registry = metadata.MetadataRegistry()
registry.registerReader('marc21', marcxml_reader)
#### OAI-PMH Client processing
# Harvest the 'hathitrust:pdus' set as 'marc21' records, tag each record with
# its local identifier in a 907$a field, and write it to a MARC file under
# marc_recs/ (the directory must already exist).

# Prefix removed from the OAI identifier before it is stored in 907$a.
OAI_ID_PREFIX = 'oai:quod.lib.umich.edu:'
# A new output file is started every BATCH_SIZE records.
BATCH_SIZE = 10000

oai = Client('http://quod.lib.umich.edu/cgi/o/oai/oai', registry)
recs = oai.listRecords(metadataPrefix='marc21',
                       set='hathitrust:pdus')

sys.stderr.write('beginning harvest\n')
writer = None
try:
    for count, rec in enumerate(recs):
        identifier = rec[0].identifier()
        if count % BATCH_SIZE == 0:
            # Harvest is capped at the first batch: stop as soon as a
            # second output file would be needed.
            if count > 1:
                break
            # MARC transmission format is binary, so open with 'wb'.
            writer = MARCWriter(
                open('marc_recs/hathi_%s.mrc' % str(count), 'wb'))
            sys.stderr.write('Opening new handle.\n')
            # NOTE(review): the original indentation was lost; this progress
            # line is assumed to belong to the batch-open branch -- confirm.
            sys.stderr.write('%s %s\n' % (count, identifier))

        record = rec[1]  # Get pyMARC representation
        # Pass empty records.
        if not record:
            continue
        try:
            record.to_unicode = True
            record.force_utf8 = True
            # str.lstrip() strips a *set of characters*, not a prefix, and
            # could eat leading characters of the local id; slice instead.
            local_id = identifier
            if local_id.startswith(OAI_ID_PREFIX):
                local_id = local_id[len(OAI_ID_PREFIX):]
            id_field = field.Field(tag='907',
                                   indicators=['0', '1'],
                                   subfields=[
                                       'a', local_id,
                                   ])
            record.add_field(id_field)
            writer.write(record)
        except Exception as e:
            # Best effort: report the failing record and keep harvesting.
            sys.stderr.write('%s %s\n' % (count, identifier))
            sys.stderr.write('%s\n' % (e,))
finally:
    # Flush and release the output file even if the harvest aborts.
    if writer is not None:
        writer.close()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment