Last active
December 16, 2015 15:19
-
-
Save scjody/5454682 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env ruby
# coding: utf-8
# @todo Properly case the organization name.
# @todo Handle case where the organization has >1 contact but no email
#   address on the first contact, e.g. CSSS DE CHICOUTIMI.
# @todo For organizations without any email address, HTML-ify their
#   postal address and put it in the notes section.
# @todo Investigate why we get BAIEJAMES in some cases.
require 'csv'
require 'active_support/inflector'
# Fetch the source PDF when no local copy exists; passing `--clobber` on the
# command line forces a fresh download.
if ARGV.include?('--clobber') || !File.exist?('CAI_liste_resp_acces.pdf')
  # An argument list keeps the shell out of the picture, and `-f` makes curl
  # exit non-zero on an HTTP error instead of saving the error page as the PDF.
  unless system('curl', '-f', '-O', 'http://www.cai.gouv.qc.ca/documents/CAI_liste_resp_acces.pdf')
    abort 'Could not download CAI_liste_resp_acces.pdf'
  end
end
# State used by the line loop that parses the converted PDF text.
block = []     # lines collected since the last blank line
block_1 = nil  # first block of the document: the table of contents
block_2 = nil  # second block: the page numbers for the table of contents
type = nil     # table-of-contents item most recently seen at the top of a block

# Blocks whose joined text is exactly a "last updated" stamp, a page number or
# the document title are skipped.
ignore = /\A(Dernière mise à jour : \d{4}-\d{2}-\d{2} \d{2}:\d{2}|Page \d+|Répertoire des organismes assujettis et des responsables de l'accès aux documents des organismes publics et de la protection des renseignements personnels)\z/
# Tags to write for each organization type. The keys are the type strings
# stored on the parsed records; an empty array means the type gets no tags.
# Frozen because nothing in the script is meant to modify the table.
$tags = {
  "AGENCES DE LA SANTÉ" => ['santé'],
  "AUTRES ORGANISMES GOUVERNEMENTAUX" => [],
  "CÉGEPS" => [],
  "CENTRE DE COMMUNICATIONS SANTÉ (911)" => [],
  "CENTRE DE SANTÉ ET DE SERVICES SOCIAUX (CSSS)" => [],
  "CENTRES D'HÉBERGEMENT ET DE RÉADAPTATION" => [],
  "CENTRES HOSPITALIERS" => ['santé', 'hôpitaux'],
  "CENTRES JEUNESSE" => ['jeunesse'],
  "COMMISSIONS SCOLAIRES" => [],
  "ÉTABLISSEMENTS PRIVÉS SUBVENTIONNÉS" => [],
  "MINISTÈRES" => [],
  "MUNICIPALITÉS" => [],
  "MUNICIPALITÉS RÉGIONALES DE COMTÉ (MRC)" => [],
  "OFFICES MUNICIPAUX D'HABITATION" => [],
  "ORDRES PROFESSIONNELS" => [],
  "ORGANISMES MUNICIPAUX" => [],
  "RÉGIES INTERMUNICIPALES" => [],
  "UNIVERSITÉS" => [],
}.freeze
# Finds the first element in the array that matches the regular expression,
# removes that element from the array, and returns the first capturing group
# matched by the regular expression.
#
# The array is modified in place when an element matches and is left untouched
# otherwise.
#
# @param [Array] array an array of strings
# @param [Regexp] regexp a regular expression with a capturing group
# @return [String, nil] the first capturing group, or nil if no element
#   matches
def find_and_delete(array, regexp)
  index = array.index { |x| x[regexp] }
  return nil if index.nil?

  array.delete_at(index)[regexp, 1]
end
# Every contact parsed from the PDF, in document order.
organizations = []

# Convert the PDF to text on standard output and stop if the conversion failed.
text_lines = `pdftotext CAI_liste_resp_acces.pdf -`.split("\n")
abort 'pdftotext could not convert CAI_liste_resp_acces.pdf' unless $?.success?

# A blank line is what triggers parsing of the collected block, and
# String#split drops trailing empty fields, so append a blank line to make
# sure the last block of the document is parsed too.
text_lines << ''

text_lines.each do |raw_line|
  line = raw_line.strip

  # Collect a block, then parse it.
  unless line.empty?
    block << line
    next
  end

  text = block * ' '
  unless text.empty? || text[ignore]
    if block_1.nil?
      # The first block is the table of contents.
      block_1 = block
    elsif block_2.nil?
      # The second block is the page numbers for the table of contents.
      block_2 = block
    else
      # The first line of a block is sometimes an item from the table of contents.
      if block_1.include?(block.first)
        type = block.first
        block.shift
      end

      # A block that held nothing but a table-of-contents item has no contact
      # details, so it does not produce a record.
      unless block.empty?
        # The labelled lines are removed from the block as they are read, so
        # the loop below only sees organization, name, role and address lines.
        organization = {
          organization: [],
          name: nil,
          role: [],
          address: [],
          voice: find_and_delete(block, /\ATél\. : ([\d# -]+)\z/),
          fax: find_and_delete(block, /\ATéléc\. : ([\d# -]+)\z/),
          tollfree: find_and_delete(block, /\ASans frais : ([\d -]+)\z/),
          email: find_and_delete(block, /\A(\S+@\S+)\z/),
          type: type,
        }

        block.each do |x|
          # Ensure that organizations are read before names, and roles before
          # addresses. Addresses swallow whatever is left.
          if organization[:name].nil?
            # Lines made only of uppercase letters, digits, punctuation and
            # spaces belong to the organization; the first other line is the
            # contact's name.
            if x[/\A[\p{Lu}\p{N}\p{Punct}\p{Space}]+\z/]
              organization[:organization] << x
            else
              organization[:name] = x
            end
          elsif organization[:address].empty?
            # The address starts at the first line that begins with a digit or
            # contains "C.P. "; lines before it are the role.
            if x[/\A\d|\bC\.P\. /]
              organization[:address] << x
            else
              organization[:role] << x
            end
          else
            organization[:address] << x
          end
        end

        # Collapse the multi-line fields into single space-separated strings.
        organization[:organization] *= ' '
        organization[:role] *= ' '
        organization[:address] *= ' '

        organizations << organization
      end
    end
  end

  block = []
end
# Alaveteli does not support multiple contacts per public body, so keep only
# the first record for each organization name.
safe = organizations.uniq { |record| record[:organization] }

# Summary counts; voice, fax and email count the records where that field is set.
puts "%4d organizations" % organizations.size
puts "%4d voice" % organizations.count { |record| record[:voice] }
puts "%4d fax" % organizations.count { |record| record[:fax] }
puts "%4d email" % organizations.count { |record| record[:email] }
puts "%4d safe" % safe.size
# Appends a header row to the CSV, followed by one row for every organization
# that has an email address. Rows are numbered from 1 in output order.
#
# @param [Array] array organization records (hashes with at least the keys
#   :organization, :email and :type)
# @param [#<<] csv the destination to append rows to, e.g. a CSV object
# @param [Hash] tags maps a record's :type to its array of tags
# @raise [RuntimeError] if a record with an email address has a blank name
# @raise [KeyError] if a record's :type is not a key of tags
def output_to_csv(array, csv, tags = $tags)
  csv << ["#id", "name", "request_email", "tag_string"]
  # Also supported: notes, publication_scheme, home_page
  # Supported but undocumented: short_name, disclosure_log

  # Skip organizations without email addresses for now.
  # TODO: HTML-ify their postal address and put into "notes".
  with_email = array.select { |organization| organization[:email] }

  with_email.each.with_index(1) do |organization, id|
    # The parser joins the name lines into a String, so a missing name shows
    # up as an empty string rather than nil.
    name = organization[:organization]
    fail "no name for #{organization.inspect}" if name.to_s.strip.empty?

    tag_string = tags.fetch(organization[:type]).join(" ")
    csv << [id, name, organization[:email], tag_string]
  end
end
# Write every parsed contact to one file and the one-record-per-organization
# subset to the other.
{
  'organizations.csv' => organizations,
  'organizations-alaveteli-safe.csv' => safe,
}.each do |path, records|
  CSV.open(path, 'w') { |csv| output_to_csv(records, csv) }
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
This has been superseded by https://github.com/opennorth/publicbodies-scrapers