Skip to content

Instantly share code, notes, and snippets.

@lawlesst
Created October 28, 2011 20:57
Show Gist options
  • Save lawlesst/1323535 to your computer and use it in GitHub Desktop.
Save lawlesst/1323535 to your computer and use it in GitHub Desktop.
OAI-PMH harvesting with Python
"""
Harvest MARC records via OAI-PMH.
"""
# Mostly adapted from:
# http://code.google.com/p/oldmapsonline/source/browse/trunk/oai-pmh/oaipmh-client-pyoai-pymarc.py
# MARCXML reader - parsing is done by pymarc.
# Python 2 only: reload sys and set the default string encoding to UTF-8.
import codecs, sys
reload(sys)
sys.setdefaultencoding('utf-8')
from oaipmh.client import Client
from oaipmh.metadata import MetadataRegistry
from lxml import etree
from cStringIO import StringIO
from lxml.etree import tostring
from pymarc import marcxml, MARCWriter, field
import sys
class MARCXMLReader(object):
    """Metadata reader that turns an OAI metadata element into a pymarc record.

    Instances are callables meant to be registered with a metadata registry
    (see ``registerReader``) for a MARCXML metadata prefix.
    """

    def __call__(self, element):
        """Parse the first child of *element* as MARCXML.

        :param element: the lxml element handed over by the OAI client; its
            first child is serialized and parsed by pymarc.
        :returns: the first record pymarc parsed, or ``None`` when *element*
            is missing, has no children, or yields no record.
        """
        # An empty metadata element used to raise IndexError on element[0];
        # returning None lets the caller skip the record instead.
        if element is None or len(element) == 0:
            return None
        handler = marcxml.XmlHandler()
        serialized = tostring(element[0], encoding='UTF-8')
        marcxml.parse_xml(StringIO(serialized), handler)
        # Guard against a payload that parsed cleanly but held no record.
        if not handler.records:
            return None
        return handler.records[0]
# Build the metadata registry and register the MARCXML reader for the
# 'marc21' metadata prefix.
from oaipmh import metadata
registry = metadata.MetadataRegistry()
marcxml_reader = MARCXMLReader()
registry.registerReader('marc21', marcxml_reader)
#### OAI-PMH Client processing
OAI_ENDPOINT = 'http://quod.lib.umich.edu/cgi/o/oai/oai'
OAI_SET = 'hathitrust:pdus'
OAI_ID_PREFIX = 'oai:quod.lib.umich.edu:'
BATCH_SIZE = 10000


def strip_oai_prefix(identifier, prefix=OAI_ID_PREFIX):
    """Return *identifier* without a leading *prefix*.

    ``str.lstrip`` removes any leading characters found in its argument, not
    a literal prefix, so it can also eat the start of the local identifier.
    Only an exact leading match is removed here.
    """
    if identifier.startswith(prefix):
        return identifier[len(prefix):]
    return identifier


def log(*parts):
    """Write *parts*, space separated, as one line on stderr."""
    sys.stderr.write(' '.join('%s' % (part,) for part in parts) + '\n')


def main():
    """Harvest MARC records over OAI-PMH and write them to a .mrc file.

    Records are requested with the 'marc21' metadata prefix from OAI_SET.
    Each non-empty record gets a 907 field carrying its identifier (without
    the OAI prefix) and is written to 'marc_recs/hathi_<count>.mrc'.
    The 'marc_recs' directory must already exist.
    """
    oai = Client(OAI_ENDPOINT, registry)
    recs = oai.listRecords(metadataPrefix='marc21', set=OAI_SET)
    log('beginning harvest')
    handle = None
    writer = None
    try:
        for count, rec in enumerate(recs):
            identifier = rec[0].identifier()
            if count % BATCH_SIZE == 0:
                # The harvest stops when the second batch boundary is reached.
                # NOTE(review): presumably a test limit - confirm before
                # relying on this for a full harvest.
                if count > 1:
                    break
                # Close any previous output before starting a new file.
                if handle is not None:
                    handle.close()
                # MARC transmission format is binary, so open in 'wb'.
                handle = open('marc_recs/hathi_%s.mrc' % str(count), 'wb')
                writer = MARCWriter(handle)
                log("Opening new handle.")
                log(count, identifier)
            record = rec[1]  # Get pyMARC representation
            # Pass empty records.
            if not record:
                continue
            try:
                record.to_unicode = True
                record.force_utf8 = True
                id_field = field.Field(
                    tag='907',
                    indicators=['0', '1'],
                    subfields=['a', strip_oai_prefix(identifier)],
                )
                record.add_field(id_field)
                writer.write(record)
            except Exception as e:
                # Best effort: report the failing record and keep harvesting.
                log(count, identifier)
                log(e)
    finally:
        # Always flush and release the output file, even on errors.
        if handle is not None:
            handle.close()


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment