szymonlopaciuk · March 27, 2018 09:42
diff --git a/get_marcxml.py b/get_marcxml.py
 #!/usr/bin/env python
 """
 Bulk download records from legacy.
 Usage: `get_marcxml.py [number_of_records] > [output_marcxml]`
 To migrate: `inspirehep migrator populate -f [path_to_marcxml]`
 """

 from lxml.etree import fromstring, tostring
 import requests
 import sys

 # Maximum number of records requested from the server per HTTP call.
 PAGE_SIZE = 250
 # Search URL template: first placeholder is `jrec` (first record to return),
 # second is `rg` (number of records to return); `of=xm` selects MARCXML output.
 URL = "http://inspirehep.net/search?jrec={}&op1=a&op2=a&ln=en&of=xm&as=1&rg={}&m1=a&m3=a&m2=a"

 if len(sys.argv) < 2:
     sys.exit("usage: get_marcxml.py [number_of_records] > [output_marcxml]")

 # Total number of records to download, taken from the command line.
 length = int(sys.argv[1])

 # Root element of the first batch; records of later batches are moved into it.
 merged = None

 # `idx` is the zero-based offset of the first record of each batch.
 for idx in range(0, length, PAGE_SIZE):
     c_page_size = min(PAGE_SIZE, length - idx)
     sys.stderr.write("Fetching batch {}-{}\n".format(idx, idx + c_page_size))
     # `jrec` is 1-based, so the zero-based offset is shifted by one; passing
     # `idx` unchanged would re-fetch the last record of the previous batch.
     response = requests.get(URL.format(idx + 1, c_page_size))
     # Fail loudly on an HTTP error instead of trying to parse an error page.
     response.raise_for_status()
     # Parse the raw bytes so the XML declaration, not requests' charset
     # guess, decides how the payload is decoded.
     xml = fromstring(response.content)

     if merged is None:
         merged = xml
     else:
         # Snapshot the children first: extending moves them out of `xml`.
         merged.extend(list(xml))

 if merged is None:
     # Nothing was fetched (zero or negative record count).
     sys.exit("No records requested; nothing to download.")

 sys.stderr.write("Downloaded {} records.\n".format(len(merged)))
 # tostring() emits ASCII bytes by default (non-ASCII characters become
 # character references), so decoding as ASCII is lossless and the write
 # works on both Python 2 and Python 3.
 sys.stdout.write(tostring(merged).decode('ascii') + "\n")
diff --git a/inspirehep_shell.py b/inspirehep_shell.py
 from lxml import etree
 from invenio_records.models import RecordMetadata
 from inspirehep.modules.orcid import OrcidConverter

 # XML schema (work-2.0.xsd from the ORCID test fixtures) used for validation.
 schema = etree.XMLSchema(file='/code/tests/integration/orcid/fixtures/record_2.0/work-2.0.xsd')

 # JSON bodies of every record whose `$schema` ends with '/hep.json'.
 # Rows with an empty `json` (NOTE(review): presumably soft-deleted records,
 # confirm) or without a `$schema` key are skipped instead of raising.
 jsons = [
     r.json for r in RecordMetadata.query.all()
     if r.json and r.json.get('$schema', '').endswith('/hep.json')
 ]

 # Collected validation failures as (control_number, exception, record) tuples.
 errors = []

 def validate(r):
     """Convert record ``r`` to XML with ``OrcidConverter`` and check it against ``schema``.

     Nothing is returned; any exception raised by the conversion or by
     ``schema.assertValid`` propagates to the caller.
     """
     orcid_xml = OrcidConverter(
         r,
         url_pattern='http://inspirehep.net/record/{recid}',
     ).get_xml()
     schema.assertValid(orcid_xml)

 # Validate every record, collecting failures instead of stopping at the first.
 for r in jsons:
     try:
         validate(r)
     except Exception as e:
         # .get() keeps the handler itself from raising KeyError when a record
         # has no 'control_number'; such a record is reported with None.
         errors.append((r.get('control_number'), e, r))
	#!/usr/bin/env python
	"""
	Bulk download records from legacy.
	Usage: `get_marcxml.py [number_of_records] > [output_marcxml]`
	To migrate: `inspirehep migrator populate -f [path_to_marcxml]`
	"""

	from lxml.etree import fromstring, tostring
	import requests
	import sys

	# Maximum number of records requested from the server per HTTP call.
	PAGE_SIZE = 250
	# Search URL template: first placeholder is `jrec` (first record to return),
	# second is `rg` (number of records to return); `of=xm` selects MARCXML output.
	URL = "http://inspirehep.net/search?jrec={}&op1=a&op2=a&ln=en&of=xm&as=1&rg={}&m1=a&m3=a&m2=a"

	if len(sys.argv) < 2:
	    sys.exit("usage: get_marcxml.py [number_of_records] > [output_marcxml]")

	# Total number of records to download, taken from the command line.
	length = int(sys.argv[1])

	# Root element of the first batch; records of later batches are moved into it.
	merged = None

	# `idx` is the zero-based offset of the first record of each batch.
	for idx in range(0, length, PAGE_SIZE):
	    c_page_size = min(PAGE_SIZE, length - idx)
	    sys.stderr.write("Fetching batch {}-{}\n".format(idx, idx + c_page_size))
	    # `jrec` is 1-based, so the zero-based offset is shifted by one; passing
	    # `idx` unchanged would re-fetch the last record of the previous batch.
	    response = requests.get(URL.format(idx + 1, c_page_size))
	    # Fail loudly on an HTTP error instead of trying to parse an error page.
	    response.raise_for_status()
	    # Parse the raw bytes so the XML declaration, not requests' charset
	    # guess, decides how the payload is decoded.
	    xml = fromstring(response.content)

	    if merged is None:
	        merged = xml
	    else:
	        # Snapshot the children first: extending moves them out of `xml`.
	        merged.extend(list(xml))

	if merged is None:
	    # Nothing was fetched (zero or negative record count).
	    sys.exit("No records requested; nothing to download.")

	sys.stderr.write("Downloaded {} records.\n".format(len(merged)))
	# tostring() emits ASCII bytes by default (non-ASCII characters become
	# character references), so decoding as ASCII is lossless and the write
	# works on both Python 2 and Python 3.
	sys.stdout.write(tostring(merged).decode('ascii') + "\n")
	from lxml import etree
	from invenio_records.models import RecordMetadata
	from inspirehep.modules.orcid import OrcidConverter

	# XML schema (work-2.0.xsd from the ORCID test fixtures) used for validation.
	schema = etree.XMLSchema(file='/code/tests/integration/orcid/fixtures/record_2.0/work-2.0.xsd')

	# JSON bodies of every record whose `$schema` ends with '/hep.json'.
	# Rows with an empty `json` (NOTE(review): presumably soft-deleted records,
	# confirm) or without a `$schema` key are skipped instead of raising.
	jsons = [
	    r.json for r in RecordMetadata.query.all()
	    if r.json and r.json.get('$schema', '').endswith('/hep.json')
	]

	# Collected validation failures as (control_number, exception, record) tuples.
	errors = []

	def validate(r):
	    """Convert record ``r`` to XML with ``OrcidConverter`` and check it against ``schema``.

	    Nothing is returned; any exception raised by the conversion or by
	    ``schema.assertValid`` propagates to the caller.
	    """
	    # Body indentation restored: the flattened original was a syntax error.
	    converter = OrcidConverter(r, url_pattern='http://inspirehep.net/record/{recid}')
	    xml = converter.get_xml()
	    schema.assertValid(xml)

	# Validate every record, collecting failures instead of stopping at the first.
	for r in jsons:
	    try:
	        validate(r)
	    except Exception as e:
	        # .get() keeps the handler itself from raising KeyError when a record
	        # has no 'control_number'; such a record is reported with None.
	        errors.append((r.get('control_number'), e, r))