Created
July 29, 2018 01:19
-
-
Save ggorlen/5611f8550a8514305cacd3e1172ff69b to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
Script to download "shelf" lists from the SFPL Bibliocommons website | |
http://stackoverflow.com/questions/189555/how-to-use-python-to-login-to-a-webpage-and-retrieve-cookies-for-later-usage | |
http://stackoverflow.com/questions/14630288/unicodeencodeerror-charmap-codec-cant-encode-character-maps-to-undefined | |
""" | |
import datetime | |
import sys | |
from bs4 import BeautifulSoup | |
from requests import session | |
# Shelf URL: use the one given on the command line when present, otherwise
# fall back to the default SFPL "For Later" shelf.
if len(sys.argv) > 1:
    # should be a valid URL to the shelf
    base_url = sys.argv[1] + "?page="
else:
    base_url = "https://sfpl.bibliocommons.com/collection/show/378235992/library/for_later?page="

# Output file is stamped with today's date, e.g. bibliocommons_shelf_2018-07-29.txt
output_filename = f"bibliocommons_shelf_{datetime.datetime.now().date()}.txt"

# Accumulates every scraped entry; written to disk at the end of the script.
shelf_contents = ""
# attempts to fix UnicodeEncodeError: character maps to <undefined> | |
def fix_enc(s):
    """Return str(s) with characters the console encoding cannot represent
    replaced by backslash escape sequences.

    Works around UnicodeEncodeError ("charmap codec can't encode character
    maps to undefined") when writing titles that contain characters outside
    the active console codepage.
    """
    # sys.stdout.encoding is None when stdout has been replaced or detached;
    # fall back to UTF-8 so the encode/decode round-trip still works.
    enc = sys.stdout.encoding or "utf-8"
    return str(s).encode(enc, errors='backslashreplace').decode(enc)
# start a new requests session | |
# Start a requests session so cookies persist across page fetches.
with session() as c:
    # index of the current page on the website (pagination is 1-based)
    page_num = 1
    # snapshot of the previous page's title list; None until one page is read
    last_title = None
    # keep checking through pages until we run out
    while True:
        # fetch the next page of the shelf
        response = c.get(base_url + str(page_num))
        # bail out on any non-OK HTTP status
        if response.status_code != 200:
            print(response)
            sys.exit(1)
        # move on to the next page
        page_num += 1
        # make a soup object with the response text
        soup = BeautifulSoup(response.text, 'html.parser')
        # find title/author/format information
        titles = soup.find_all("span", class_="title")
        formats = soup.find_all("span", class_="format")
        assert len(titles) == len(formats)
        # Snapshot this page's full title list; the site serves the last page
        # again for any page number past the end, so an unchanged list means
        # we've walked off the end of the shelf.
        current_title = str(titles)
        if current_title == last_title:
            break
        # walk the title/format pairs in page order
        for title, fmt in zip(titles, formats):
            try:
                # record and echo the title
                title_text = title.get_text().strip()
                shelf_contents += title_text + "\n"
                print(title_text)
                # The author span (when present) appears within the next few
                # elements after the title; give up after 7 elements so we
                # don't walk into the next entry.
                for i, e in enumerate(title.next_elements):
                    if "class=\"author" in str(e):
                        author = e.get_text().strip()
                        shelf_contents += author + "\n"
                        print(author)
                        break
                    if i >= 6:
                        break
                # record and echo the format, collapsing embedded newlines
                format_text = fmt.get_text().replace("\n", "").strip()
                shelf_contents += format_text + "\n"
                print(format_text)
            except (IndexError, UnicodeEncodeError):
                # best-effort scrape: skip entries that can't be read/printed
                pass
            # blank line between entries
            shelf_contents += "\n"
            print()
        # remember this page's titles for the end-of-shelf check above
        last_title = current_title
# write to file | |
# Dump the accumulated shelf listing to the dated output file, escaping any
# characters the console encoding can't represent.
output_text = fix_enc(shelf_contents)
with open(output_filename, 'w') as out_file:
    out_file.write(output_text)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment