Last active
August 10, 2023 05:47
-
-
Save epoz/0c994423101945446938e9a98bf1588e to your computer and use it in GitHub Desktop.
OAI Downloader
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os
import traceback
import xml.etree.ElementTree as ET

import httpx
# Directory that harvest() writes one file per harvested record into.
OUTPATH = "./harvest"
def parse(result: str):
    """Extract the records and the resumption token from one OAI-PMH response.

    Args:
        result: The XML body of an OAI-PMH response.

    Returns:
        A ``(records, token)`` tuple. ``records`` is the list of ``record``
        elements found anywhere in the document. ``token`` is the text of the
        ``resumptionToken`` element with surrounding whitespace removed, or
        ``None`` when the element is absent or empty.

    Raises:
        xml.etree.ElementTree.ParseError: If ``result`` is not well-formed XML.
    """
    ns = "{http://www.openarchives.org/OAI/2.0/}"
    doc = ET.fromstring(result)
    buf = doc.findall(f".//{ns}record")
    token = doc.find(f".//{ns}resumptionToken")
    if token is not None:
        # Pretty-printed responses may pad the token with whitespace; an
        # empty or whitespace-only token must read as "no further pages".
        token = (token.text or "").strip() or None
    return buf, token
def harvest(uri: str, metadata: str):
    """Download all records of an OAI-PMH endpoint into ``OUTPATH``.

    Sends a ``ListRecords`` request for the given metadata prefix, then keeps
    requesting further pages for as long as a response carries a resumption
    token. Each record is written to ``OUTPATH`` as its serialized XML, in a
    file named after the record's identifier.

    Args:
        uri: Base URL of the OAI-PMH endpoint (without a query string).
        metadata: Value sent as the ``metadataPrefix`` argument.

    Raises:
        Exception: If the initial request does not return HTTP 200. Failures
            on later pages are reported on stdout/stderr and end the harvest
            early, keeping whatever was already written.
    """
    os.makedirs(OUTPATH, exist_ok=True)

    # Passing the arguments via ``params`` lets httpx URL-encode them.
    r = httpx.get(
        uri,
        params={"verb": "ListRecords", "metadataPrefix": metadata},
        timeout=60,
    )
    if r.status_code != 200:
        raise Exception(f"{r.status_code} {r.text}")
    buf, token = parse(r.text)

    while True:
        # Save the current page *before* checking the token, so the final
        # page (which carries no token) is written as well.
        for record in buf:
            node = record.find('.//{http://www.openarchives.org/OAI/2.0/}identifier')
            if node is None or not node.text:
                continue  # nothing to name the file after
            # Identifiers may contain path separators; keep every record
            # directly inside OUTPATH.
            filename = node.text.strip().replace("/", "_").replace(os.sep, "_")
            with open(os.path.join(OUTPATH, filename), "wb") as out:
                out.write(ET.tostring(record))

        if not token:
            break

        try:
            r = httpx.get(
                uri,
                params={"verb": "ListRecords", "resumptionToken": token},
                timeout=60,
            )
        except httpx.HTTPError:
            # Best effort: report the transport failure and keep what we have.
            traceback.print_exc()
            break
        if r.status_code != 200:
            print(r.text, r.status_code, "Error!")
            break
        buf, token = parse(r.text)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment