Skip to content

Instantly share code, notes, and snippets.

Created July 29, 2018 01:19
Show Gist options
  • Save ggorlen/5611f8550a8514305cacd3e1172ff69b to your computer and use it in GitHub Desktop.
Save ggorlen/5611f8550a8514305cacd3e1172ff69b to your computer and use it in GitHub Desktop.
Script to download "shelf" lists from the SFPL Bibliocommons website
import datetime
import sys
from bs4 import BeautifulSoup
from requests import session
# Script configuration: the shelf URL comes from the first command-line
# argument; the page number is appended to it for every request.
try:
    # should be a valid URL to the shelf
    base_url = sys.argv[1] + "?page="
except IndexError:
    # no URL was supplied on the command line
    base_url = ""

# Results are written to a date-stamped text file.
# NOTE(review): the original timestamp expression was truncated; an ISO date
# (YYYY-MM-DD, safe in filenames on every platform) is assumed here --
# confirm the intended format.
output_filename = "bibliocommons_shelf_" + str(datetime.date.today()) + ".txt"

# accumulates the text that is eventually written to output_filename
shelf_contents = ""
def fix_enc(s):
    """Return ``str(s)`` in a form that is safe to print to stdout.

    Works around ``UnicodeEncodeError: character maps to <undefined>`` on
    consoles with a limited code page: the text is round-tripped through
    the stdout encoding, and any character that encoding cannot represent
    is replaced by its backslash escape (e.g. ``\\u2019``).

    Args:
        s: Any object; it is converted with ``str()`` first.

    Returns:
        A string containing only characters encodable by stdout.
    """
    # sys.stdout may be None or lack a usable encoding (e.g. when it has
    # been replaced or detached), so fall back to UTF-8 in that case.
    encoding = getattr(sys.stdout, "encoding", None) or "utf-8"
    return str(s).encode(encoding, errors='backslashreplace').decode(encoding)
# Walk the shelf page by page, collect title / author / format for every
# item, then write everything that was collected to output_filename.

if not base_url:
    # Without a shelf URL every request would fail with an opaque requests
    # error, so stop early with a usage hint instead.
    sys.exit("usage: python <this script> <bibliocommons shelf URL>")

# text chunks for the output file; joined once at the end instead of
# growing a string with repeated +=
entries = []

# fingerprint of the previous page's titles (None until a page was read)
last_title = None

# start a new requests session
with session() as c:
    # index to store the current page on the website
    page_num = 1

    # keep checking through pages until we run out
    while True:
        # navigate to the next page in the shelf
        response = c.get(base_url + str(page_num))
        if response.status_code != 200:
            # the request failed; keep whatever was collected so far
            break

        # move on to the next page
        page_num += 1

        # make a soup object with the response text
        soup = BeautifulSoup(response.text, 'html.parser')

        # find title/author/format information
        titles = soup.find_all("span", class_="title")
        formats = soup.find_all("span", class_="format")
        if len(titles) != len(formats):
            # an explicit error rather than assert, which -O strips
            raise RuntimeError(
                "page %d: found %d titles but %d formats"
                % (page_num - 1, len(titles), len(formats))
            )

        # a page without any items means we are past the end of the shelf
        if not titles:
            break

        # fingerprint this page by the string form of all of its titles
        current_title = str(titles)

        # stop checking pages if this page is the same as the last one,
        # meaning we're at the end of the list
        if current_title == last_title:
            break

        for title, fmt in zip(titles, formats):
            entry_lines = [title.get_text().strip()]

            # check whether there's an author for this title: only the
            # first 7 elements following the title are inspected
            for idx, e in enumerate(title.next_elements):
                if idx >= 7:
                    break
                if "class=\"author" in str(e):
                    entry_lines.append(e.get_text().strip())
                    break

            entry_lines.append(fmt.get_text().replace("\n", "").strip())

            # one line per field, followed by a blank separator line
            entry = "\n".join(entry_lines) + "\n\n"
            entries.append(entry)

            # echo progress; fix_enc keeps the console from raising
            # UnicodeEncodeError on characters it cannot display
            print(fix_enc(entry), end="")

        # make the current page the last page
        last_title = current_title

shelf_contents += "".join(entries)

# write to file; an explicit encoding avoids UnicodeEncodeError on
# platforms whose default file encoding cannot represent every title
with open(output_filename, 'w', encoding='utf-8') as f:
    f.write(shelf_contents)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment