Python script to harvest Prince Edward Island Legislative Assembly Hansard PDFs and convert them to plain-text (UTF-8) files
#!/usr/bin/env python3
# You may need to install these dependencies first:
#
#     pip install lxml
#     pip install requests
#
# The conversion step also shells out to pdftotext, which is part of the
# poppler-utils package rather than something installable with pip.
from lxml import html
import requests
from urllib.request import urlretrieve
import os
# List to hold all the PDF files we're going to harvest
pdfs = []
# Retrieve the main "Daily Debates" page from the Legislative Assembly website
page = requests.get('http://www.assembly.pe.ca/hansard/index.php')
tree = html.fromstring(page.content)
# The 'sittings' -- i.e. "Spring 2005", etc. -- are found in a
# <select name="selectsitting"> element.
sittings = tree.xpath("//select[@name='selectsitting']/option")

# Make a list of the sitting codes -- the values we can pass as the
# 'selectsitting' parameter to select dates for a given sitting
sittings_codes = [option.attrib['value'] for option in sittings]

# Make a list of the sitting names, like "Spring 2005"
sittings_names = [option.text for option in sittings]
# Iterate through each sitting, printing its name as a progress indicator
for i, sitting_code in enumerate(sittings_codes):
    print('Harvesting ' + sittings_names[i])

    # Retrieve the calendar for the sitting
    page = requests.get('http://www.assembly.pe.ca/hansard/index.php',
                        params={'selectsitting': sitting_code, 'action': 'Go'})
    tree = html.fromstring(page.content)

    # Get all the sitting days in this sitting by looking for links to the
    # Hansard PDFs under assembly.pe.ca/sittings/
    for elt in tree.xpath("//a[contains(@href,'assembly.pe.ca/sittings/')]"):
        filename = elt.attrib['href'].split('/')[-1]
        sitting = elt.attrib['href'].split('/')[-3]
        sitting_directory = os.path.join('documents', sitting)
        if not os.path.exists(sitting_directory):
            os.makedirs(sitting_directory)

        # Download the PDF, convert it to UTF-8 text, then discard the PDF
        pdf_path = os.path.join(sitting_directory, filename)
        urlretrieve(elt.attrib['href'], pdf_path)
        pdfs.append(pdf_path)
        os.system('pdftotext -raw -enc UTF-8 "%s"' % pdf_path)
        os.remove(pdf_path)
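
# ---------------------------------------------------------------------------
# Optional: a more defensive download-and-convert step. This is a minimal
# sketch, not part of the original harvesting loop above. It assumes the same
# URL layout, uses requests (already a dependency) instead of urlretrieve so
# HTTP errors can be checked, and invokes pdftotext via subprocess to avoid
# shell-quoting problems. The helper name fetch_and_convert is hypothetical.
import subprocess

def fetch_and_convert(url, sitting_directory):
    """Download one Hansard PDF and convert it to UTF-8 text."""
    filename = url.split('/')[-1]
    pdf_path = os.path.join(sitting_directory, filename)
    response = requests.get(url)
    if response.status_code != 200:
        # Skip sitting days whose PDF is missing rather than crashing
        print('Skipping %s (HTTP %d)' % (url, response.status_code))
        return
    with open(pdf_path, 'wb') as f:
        f.write(response.content)
    # An argument list avoids the quoting pitfalls of os.system()
    subprocess.run(['pdftotext', '-raw', '-enc', 'UTF-8', pdf_path], check=True)
    os.remove(pdf_path)

# Swapping fetch_and_convert() in for the urlretrieve/os.system lines in the
# loop above would leave the rest of the harvesting logic unchanged.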