Created
December 5, 2016 17:13
-
-
Save mjlavin80/2e37e5a22a80a06fb7fe6324de88fac4 to your computer and use it in GitHub Desktop.
Loop through a set of Worldcat ids and download metadata for each
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# This Python script loops through a set of WorldCat ids, downloads the metadata for each id, and stores the full XML (URL-quoted) in a SQLite database (datastore.db) for later parsing.
# If the daily key limit is reached, the script terminates; the next time it runs, it looks up each WorldCat id in the database and skips the ids that are already present.
# The intended way to run this script is therefore as a daily cron job until data has been downloaded for every id.
import sqlite3

# WorldCat ids go here as a list, e.g. ids_list = [11111, 22222, 33333]
ids_list = []

# Replace 'Your key here' with your API key.
KEY = 'Your key here'

# Open (or create) the local SQLite file and make sure the table that holds
# the downloaded records exists: an integer id plus the record payload.
conn = sqlite3.connect('datastore.db')
c = conn.cursor()
c.execute("""CREATE TABLE IF NOT EXISTS raw_data (id INTEGER, xml BLOB)""")
def check_data(_id):
    """Return how many rows of ``raw_data`` are stored under ``_id``.

    Args:
        _id: WorldCat id to look up (string or integer).

    Returns:
        The number of matching rows; 0 means the id has not been stored yet.
    """
    # Bind the id as a parameter instead of splicing it into the SQL text,
    # so a malformed id can never change the statement being executed.
    row = c.execute(
        "SELECT COUNT(*) FROM raw_data WHERE id=?", (_id,)
    ).fetchone()
    return row[0]
def insert_data(_id, xml):
    """Insert one record into ``raw_data`` and commit it immediately.

    Args:
        _id: WorldCat id of the record (string or integer).
        xml: Record payload to store in the ``xml`` column.
    """
    # Parameter binding lets SQLite handle quoting, so payloads containing
    # quote characters no longer break (or alter) the statement.
    c.execute("INSERT INTO raw_data (id, xml) VALUES (?, ?)", (_id, xml))
    # Commit per record so rows already downloaded survive an early exit.
    conn.commit()
import requests | |
import urllib.parse | |
import xml.etree.ElementTree as ET | |
import time | |
def worldcat_record(oclc_id, key, format="atom", schema='info%3Asrw%2Fschema%2Fdc', timeout=30):
    """Request one record from the WorldCat catalog content web service.

    Args:
        oclc_id: WorldCat/OCLC id of the record; converted with ``str``.
        key: API key, sent as the ``wskey`` query parameter.
        format: Value of the ``format`` query parameter.
        schema: Value of the ``recordSchema`` query parameter. It is placed
            in the URL verbatim, so it must already be percent-encoded.
        timeout: Seconds passed to ``requests.get`` as its timeout, so a
            stalled server cannot hang the script forever.

    Returns:
        The response object returned by ``requests.get``.
    """
    # NOTE(review): the key travels over plain HTTP; switch to https if the
    # endpoint supports it.
    built_url = "".join([
        "http://www.worldcat.org/webservices/catalog/content/",
        str(oclc_id), "?wskey=", str(key),
        "&format=", format, "&recordSchema=", schema,
    ])
    return requests.get(built_url, timeout=timeout)
# Fetch and store every id that is not already in the database.
for w_id in ids_list:
    id_text = str(w_id)

    # Already downloaded on an earlier run: nothing to do for this id.
    if check_data(id_text) > 0:
        continue

    # Pause before each request to space out calls to the web service.
    time.sleep(3)
    response = worldcat_record(id_text, KEY)

    # Any non-OK status ends the run; the file header describes this as the
    # daily-key-limit case.
    # NOTE(review): a permanent failure for one id also stops here, and that
    # id will be the first one retried on the next run.
    if response.status_code != requests.codes.ok:
        print(response.status_code)
        break

    # The XML is stored URL-quoted.
    insert_data(id_text, urllib.parse.quote_plus(response.text))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment