Skip to content

Instantly share code, notes, and snippets.

@szymonlopaciuk
Last active March 27, 2018 09:42
Show Gist options
  • Save szymonlopaciuk/a9e00c5b4e5eaf1fdfe3ac94266501e8 to your computer and use it in GitHub Desktop.
Save szymonlopaciuk/a9e00c5b4e5eaf1fdfe3ac94266501e8 to your computer and use it in GitHub Desktop.
Bulk ORCID Validate
#!/usr/bin/env python
""""
Bulk download records from legacy, usage: `get_marcxml.py [number_of_records] > [output_marcxml]`
To migrate: `inspirehep migrator populate -f [path_to_marcxml]`
"""
from lxml.etree import fromstring, tostring
import requests
import sys
# Maximum number of records requested from the legacy search endpoint per call.
PAGE_SIZE = 250
# Search URL template; the placeholders are filled with the record offset
# (`jrec`) and the batch size (`rg`), in that order.
URL = "http://inspirehep.net/search?jrec={}&op1=a&op2=a&ln=en&of=xm&as=1&rg={}&m1=a&m3=a&m2=a"

# Fail with a readable message instead of a bare IndexError when the
# record count is missing from the command line.
if len(sys.argv) < 2:
    sys.exit("usage: get_marcxml.py [number_of_records] > [output_marcxml]")
length = int(sys.argv[1])
# With nothing to fetch the merged tree would never be created, so stop early.
if length <= 0:
    sys.exit("number_of_records must be a positive integer")

# Offset of the first record of every batch.
indexes = list(range(0, length, PAGE_SIZE))

# Root element of the first response; records from later batches are moved
# into it so that a single XML document is emitted at the end.
merged = None
for idx in indexes:
    # The last batch may be smaller than PAGE_SIZE.
    c_page_size = min(PAGE_SIZE, length - idx)
    sys.stderr.write("Fetching batch {}-{}\n".format(idx, idx + c_page_size))
    # NOTE(review): offsets start at 0 here; confirm whether the endpoint
    # treats `jrec` as 1-based, in which case consecutive batches would
    # overlap by one record.
    response = requests.get(URL.format(idx, c_page_size))
    # Do not try to parse an HTTP error page as record XML.
    response.raise_for_status()
    xml = fromstring(response.text.encode('utf-8'))
    if merged is None:
        merged = xml
    else:
        # Snapshot the children first: appending an element moves it out of
        # its current parent, which would disturb a live iteration.
        for elem in list(xml):
            merged.append(elem)

sys.stderr.write("Downloaded {} records.\n".format(len(merged)))
# `tostring` returns bytes; write them through the binary buffer on Python 3
# and directly to stdout on Python 2 (where there is no `buffer` attribute).
getattr(sys.stdout, "buffer", sys.stdout).write(tostring(merged) + b"\n")
from lxml import etree
from invenio_records.models import RecordMetadata
from inspirehep.modules.orcid import OrcidConverter
# Collected validation failures, one entry per record that did not validate.
errors = []

# XML Schema that every generated document is checked against.
schema = etree.XMLSchema(
    file='/code/tests/integration/orcid/fixtures/record_2.0/work-2.0.xsd'
)

# JSON bodies of all stored records whose `$schema` ends with `/hep.json`.
jsons = []
for record in RecordMetadata.query.all():
    if record.json['$schema'].endswith('/hep.json'):
        jsons.append(record.json)
def validate(r):
    """Convert a record to XML and check it against the module-level schema.

    The record `r` is handed to `OrcidConverter` together with the
    inspirehep.net record URL pattern; the XML the converter produces is
    then passed to `schema.assertValid`.

    Returns nothing. Any exception raised by the converter or by the
    schema assertion propagates to the caller.
    """
    work_xml = OrcidConverter(
        r,
        url_pattern='http://inspirehep.net/record/{recid}',
    ).get_xml()
    schema.assertValid(work_xml)
# Validate every selected record, collecting failures instead of stopping
# at the first one.
for record in jsons:
    try:
        validate(record)
    except Exception as exc:
        # Keep the record id, the exception and the full record for later
        # inspection.
        failure = (record['control_number'], exc, record)
        errors.append(failure)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment