Last active
December 29, 2023 19:48
-
-
Save bencrowder/5360985 to your computer and use it in GitHub Desktop.
Small Python script that scrapes LDS General Conference transcripts and outputs an HTML page listing their scripture references. Example: http://bencrowder.net/files/gc-references/2013-04
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python | |
# -*- coding: utf-8 -*- | |
import re | |
import codecs | |
import requests | |
import bs4 | |
# Which conference to scrape. Change these before running.
year = 2013   # four-digit conference year
month = 4     # conference month as a number (4 = April, 10 = October)
limit = None  # forwarded as the `limit` of the talk search; None applies no cap
class ConferenceSession:
    """Scripture-reference index for one General Conference.

    Constructing an instance runs the whole pipeline: it downloads the
    session page, downloads every talk linked from it, collects the
    scripture references each talk links to, sorts them in book order and
    writes the result to an HTML file.

    The code sticks to syntax that is valid on both Python 2 and Python 3.

    Instance attributes:
        year, month: the conference being indexed.
        talks: list of ``{'title', 'url', 'speaker', 'references'}`` dicts.
        references: maps a reference title to the talks that cite it.
        urls: maps a reference title to the first URL seen for it.
        sorted_list: all reference titles in (book, chapter, verse) order.
        sorted_references: maps a volume key (or ``'other'``) to the
            ordered reference titles belonging to that volume.
        handle: the file object ``print_list`` writes to (set by ``save``).
    """

    # Seconds to wait for each HTTP response before giving up.
    request_timeout = 30

    # Parser handed to BeautifulSoup; naming it keeps results independent of
    # whichever optional parser libraries happen to be installed.
    html_parser = 'html.parser'

    # Volumes in the order they are checked and printed.
    volume_order = (
        'old_testament',
        'new_testament',
        'book_of_mormon',
        'doctrine_and_covenants',
        'pearl_of_great_price',
    )

    # List of book names in the scriptures, used for sorting.  The position
    # of a name inside its list is its sort rank within that volume.
    book_names = {
        'old_testament': [
            'Genesis', 'Exodus', 'Leviticus', 'Numbers', 'Deuteronomy',
            'Joshua', 'Judges', 'Ruth', '1 Samuel', '2 Samuel', '1 Kings',
            '2 Kings', '1 Chronicles', '2 Chronicles', 'Ezra', 'Nehemiah',
            'Esther', 'Job', 'Psalms', 'Psalm', 'Proverbs', 'Ecclesiastes',
            'Song of Solomon', 'Isaiah', 'Jeremiah', 'Lamentations',
            'Ezekiel', 'Daniel', 'Hosea', 'Joel', 'Amos', 'Obadiah', 'Jonah',
            'Micah', 'Nahum', 'Habakkuk', 'Zephaniah', 'Haggai', 'Zechariah',
            'Malachi',
        ],
        'new_testament': [
            'Matthew', 'Mark', 'Luke', 'John', 'Acts', 'Romans',
            '1 Corinthians', '2 Corinthians', 'Galatians', 'Ephesians',
            'Philippians', 'Colossians', '1 Thessalonians',
            '2 Thessalonians', '1 Timothy', '2 Timothy', 'Titus', 'Philemon',
            'Hebrews', 'James', '1 Peter', '2 Peter', '1 John', '2 John',
            '3 John', 'Jude', 'Revelation',
        ],
        'book_of_mormon': [
            '1 Nephi', '2 Nephi', 'Jacob', 'Enos', 'Jarom', 'Omni',
            'Words of Mormon', 'Mosiah', 'Alma', 'Helaman', '3 Nephi',
            '4 Nephi', 'Mormon', 'Ether', 'Moroni',
        ],
        'doctrine_and_covenants': ['D&C'],
        # \u2014 is the em dash; the names are text (not UTF-8 bytes) so they
        # compare equal to scraped titles on Python 2 and Python 3 alike.
        'pearl_of_great_price': [
            'Moses', 'Abraham', u'Joseph Smith\u2014Matthew',
            u'Joseph Smith\u2014History', 'Articles of Faith',
        ],
    }

    # "<book> <chapter>[:<verse>]"; group 1 = book, 2 = chapter, 4 = verse.
    _REFERENCE_RE = re.compile(r'(.*?) (\d+)(:(\d+))?')

    # A reference title that is only "<section>:<verse>" (no book name).
    _BARE_VERSE_RE = re.compile(r'^\d{1,3}:')

    _MONTH_NAMES = {
        1: 'January', 2: 'February', 3: 'March', 4: 'April', 5: 'May',
        6: 'June', 7: 'July', 8: 'August', 9: 'September', 10: 'October',
        11: 'November', 12: 'December',
    }

    # (volume key, HTML anchor id, heading text) for each printed volume.
    _VOLUME_HEADINGS = (
        ('old_testament', 'old-testament', 'Old Testament'),
        ('new_testament', 'new-testament', 'New Testament'),
        ('book_of_mormon', 'book-of-mormon', 'Book of Mormon'),
        ('doctrine_and_covenants', 'doctrine-and-covenants',
         'Doctrine and Covenants'),
        ('pearl_of_great_price', 'pearl-of-great-price',
         'Pearl of Great Price'),
    )

    # Inline stylesheet, one output line per entry.
    _STYLESHEET_LINES = (
        '\t\t* { -moz-box-sizing: border-box; box-sizing: border-box; }\n',
        '\t\ta { color: #5591ce; text-decoration: none; }\n',
        '\t\ta:hover { text-decoration: underline; }\n',
        '\t\tbody { margin: 0; padding: 0; font-family: Helvetica, Arial, sans-serif; }\n',
        '\t\t#page { max-width: 800px; width: 95%; margin: 50px auto; }\n',
        '\t\t#page h1 { font-size: 1.8em; }\n',
        '\t\t#page h2 { font-size: 1.6em; margin: 2em 0 .5em; }\n',
        '\t\t#page > ul { list-style: none; margin: 0; padding: 0; line-height: 1.5em; }\n',
        '\t\t#page > ul > li { border-bottom: solid 1px #ddd; padding: 5px 0; overflow: auto; clear: both; }\n',
        '\t\t#page > ul label { font-weight: bold; font-size: 1.2em; width: 50%; float: left; }\n',
        '\t\t#page > ul ul.refs { margin: 0; float: left; padding: 0; list-style: none; }\n',
        '\t\t#page > ul.toc > li { border: none; display: inline-block; }\n',
        '\t\t#page > ul.toc > li + li:before { content: " -- "; color: #ccc; }\n',
        '\t\t@media screen and (max-width: 750px) {\n',
        '\t\t\t#page { margin: 15px auto; }\n',
        '\t\t\t#page > ul label { float: none; }\n',
        '\t\t\t#page > ul ul.refs { float: none; }\n',
        '\t\t}\n',
    )

    def __init__(self, year, month, limit=None):
        """Scrape, sort and save the references for one conference.

        Args:
            year: four-digit conference year.
            month: conference month number (1-12).
            limit: maximum number of talk entries to read from the session
                page, or None for no cap.

        Raises:
            ValueError: if ``month`` is not a month number.
        """
        self.year = year
        self.month = month
        # Fail before any download rather than when the page title is built.
        self._month_name()
        self._reset()

        # Get the talks
        print('Getting the talks...')
        self.get_talks(year, month, limit)

        # Get the references
        print('Getting references for each talk...')
        self.get_references()

        # Sort by book, then chapter, then verse
        print('Sorting...')
        self.sorted_list = sorted(self.references, key=self._sort_key)

        # Sort into books, populates self.sorted_references
        self.sort_into_books()

        # Save the list to disk
        print('Saving to disk...')
        self.save()

    def _reset(self):
        """Give this instance its own empty result containers."""
        self.talks = []
        self.references = {}
        self.urls = {}
        self.sorted_list = []
        self.sorted_references = self._empty_volumes()

    def _empty_volumes(self):
        """Return a fresh ``{volume: []}`` dict including ``'other'``."""
        volumes = dict((name, []) for name in self.volume_order)
        volumes['other'] = []
        return volumes

    def _fetch_soup(self, url):
        """Download ``url`` and return it parsed by BeautifulSoup."""
        response = requests.get(url, timeout=self.request_timeout)
        return bs4.BeautifulSoup(response.content, self.html_parser)

    def _month_name(self):
        """Return the English name of ``self.month``.

        Raises:
            ValueError: if ``self.month`` is not in 1-12.
        """
        if self.month not in self._MONTH_NAMES:
            raise ValueError('month must be 1-12, got %r' % (self.month,))
        return self._MONTH_NAMES[self.month]

    def _escape(self, value):
        """Return ``value`` as text safe for HTML content and attributes."""
        from xml.sax.saxutils import escape
        return escape(u'%s' % (value,), {'"': '&quot;'})

    def _book_of(self, key):
        """Return the book-name part of a reference title, or None."""
        found = self._REFERENCE_RE.match(key)
        return found.group(1) if found else None

    def _volume_of(self, book_name):
        """Return the volume key containing ``book_name``, else 'other'."""
        for volume in self.volume_order:
            if book_name in self.book_names[volume]:
                return volume
        return 'other'

    def _sort_key(self, key):
        """Combined key: book rank, then chapter, then verse."""
        return (
            self.sort_by_book(key),
            self.sort_by_chapter(key),
            self.sort_by_verse(key),
        )

    # Download the talks for a given conference session
    def get_talks(self, year, month, limit=None):
        """Fill ``self.talks`` from the session page for ``year``/``month``.

        Only ``<span class="talk">`` elements that contain a link with some
        content are kept.  The speaker is the first child of the
        ``<span class="speaker">`` found under the talk's parent element,
        or an empty string when there is none.
        """
        session_url = 'http://www.lds.org/general-conference/sessions/%04d/%02d' % (year, month)
        soup = self._fetch_soup(session_url)

        self.talks = []
        # Get all <span class="talk">
        for talk in soup.find_all("span", "talk", limit=limit):
            link = talk.a
            if link is None or not link.contents:
                continue
            speaker_tag = talk.parent.find("span", "speaker")
            if speaker_tag is not None and speaker_tag.contents:
                speaker = speaker_tag.contents[0]
            else:
                speaker = ''
            self.talks.append({
                'title': link.contents[0],
                'url': link['href'],
                'speaker': speaker,
            })

    # Get references for a given talk
    def get_refs_for_talk(self, url):
        """Return the scripture references linked from the talk at ``url``.

        Returns:
            A list of ``{'title': ..., 'url': ...}`` dicts, one per
            ``<a class="scriptureRef">`` that has content, in page order.
        """
        soup = self._fetch_soup(url)
        response = []
        # Get all <a class="scriptureRef">
        for ref in soup.find_all("a", "scriptureRef"):
            if not ref.contents:
                continue
            title = ref.contents[0].strip()
            ref_url = ref['href']

            # A title that starts with "<number>:" and links into
            # dc-testament has no book name of its own, so give it one.
            if self._BARE_VERSE_RE.search(title) and 'scriptures/dc-testament' in ref_url:
                title = 'D&C %s' % title
            title = title.replace('Doctrine and Covenants', 'D&C')

            # Replace non-breaking spaces with normal spaces
            title = title.replace(u"\u00A0", " ")

            response.append({'title': title, 'url': ref_url})
        return response

    # Go through the talks and get references for each
    def get_references(self):
        """Fetch every talk's references and index talks by reference.

        Sets ``talk['references']`` on each talk, records the first URL
        seen for each reference title in ``self.urls`` and appends the talk
        to ``self.references[title]``.  A talk that cites the same
        reference more than once is listed only once for it.
        """
        for talk in self.talks:
            talk['references'] = self.get_refs_for_talk(talk['url'])

            for ref in talk['references']:
                title = ref['title']
                if title not in self.references:
                    self.references[title] = []
                    self.urls[title] = ref['url']

                citing = self.references[title]
                # Talks are handled one at a time, so a repeat citation can
                # only be the entry appended most recently.
                if not citing or citing[-1] is not talk:
                    citing.append(talk)

    # Sort function by verse (after the colon)
    def sort_by_verse(self, key):
        """Return the first verse number in ``key``, or 0 if it has none."""
        found = self._REFERENCE_RE.match(key)
        if found and found.group(4) is not None:
            return int(found.group(4))
        return 0

    # Sort by chapter (just before the colon)
    def sort_by_chapter(self, key):
        """Return the chapter number in ``key``, or 0 if it has none."""
        found = self._REFERENCE_RE.match(key)
        if found:
            return int(found.group(2))
        return 0

    # Sort by book name
    def sort_by_book(self, key):
        """Return the book's position inside its volume's name list.

        Titles whose book is not in any list (or that do not look like a
        reference) get 0.
        """
        book_name = self._book_of(key)
        volume = self._volume_of(book_name)
        if volume == 'other':
            return 0
        return self.book_names[volume].index(book_name)

    # Sort self.sorted_list out by book (populates self.sorted_references)
    def sort_into_books(self):
        """Rebuild ``self.sorted_references`` from ``self.sorted_list``.

        The relative order of ``self.sorted_list`` is kept inside each
        volume.  The mapping is rebuilt from scratch on every call, so
        calling this twice does not duplicate entries.
        """
        volumes = self._empty_volumes()
        for ref in self.sorted_list:
            volumes[self._volume_of(self._book_of(ref))].append(ref)
        self.sorted_references = volumes

    # Saves a single volume
    def print_list(self, book):
        """Write the ``<li>`` entries of one volume to ``self.handle``.

        Args:
            book: a key of ``self.sorted_references``.
        """
        write = self.handle.write
        for ref in self.sorted_references[book]:
            write('<li>\n\t<label><a href="%s">%s</a></label>\n\t<ul class="refs">\n'
                  % (self._escape(self.urls[ref]), self._escape(ref)))
            for talk in self.references[ref]:
                write('\t\t<li><a href="%s">%s</a></li>\n'
                      % (self._escape(talk['url']), self._escape(talk['title'])))
            write('\t</ul>\n</li>\n')

    # Counts a single volume
    def count_list(self, book):
        """Return how many distinct references fall in volume ``book``."""
        return len(self.sorted_references[book])

    # Save the whole list
    def save(self, path='output.html'):
        """Write the complete HTML report.

        Args:
            path: destination file; defaults to ``output.html`` in the
                current directory.

        Raises:
            ValueError: if ``self.month`` is not a month number.
        """
        page_title = self._escape(
            '%s %s General Conference Scripture References'
            % (self._month_name(), self.year))

        # The with-block closes the file even when a write fails.
        with codecs.open(path, 'w', 'utf-8') as f:
            self.handle = f  # print_list writes through self.handle

            f.write('<html>\n')
            f.write('<head>\n')
            f.write('\t<meta charset="utf-8">\n')
            f.write('\t<title>%s</title>\n' % page_title)
            f.write('\t<style type="text/css">\n')
            f.writelines(self._STYLESHEET_LINES)
            f.write('\t</style>\n')
            f.write('</head>\n')
            f.write('<body>\n')
            f.write('<section id="page">\n')
            f.write('\t<h1>%s</h1>\n\n' % page_title)

            f.write('\t<ul class="toc">\n')
            for _volume, anchor, heading in self._VOLUME_HEADINGS:
                f.write('\t\t<li><a href="#%s">%s</a></li>\n' % (anchor, heading))
            f.write('\t</ul>\n\n')

            for volume, anchor, heading in self._VOLUME_HEADINGS:
                f.write('\t<h2 id="%s">%s</h2>\n' % (anchor, heading))
                f.write('\t<ul>\n')
                self.print_list(volume)
                f.write('\t</ul>\n')

            # Unrecognised references get a section but no anchor or TOC entry.
            f.write('\t<h2>Other</h2>\n')
            f.write('\t<ul>\n')
            self.print_list('other')
            f.write('\t</ul>\n')

            f.write('</section>\n')
            f.write('</body>\n')
            f.write('</html>\n')
if __name__ == '__main__':
    # Constructing the session performs the whole run: it fetches the talks,
    # collects and sorts their references, and saves the HTML report.
    session = ConferenceSession(year, month, limit)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment