Created
August 9, 2012 13:53
-
-
Save federomero/3304395 to your computer and use it in GitHub Desktop.
Uruguayan constitution markdown formatter
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# encoding: UTF-8
#
# Uruguayan constitution markdown formatter.
#
# Downloads one constitution page from www0.parlamento.gub.uy, transcodes it
# from ISO-8859-1 to UTF-8, rewrites its HTML as Markdown through a sequence
# of regex substitutions, and prints the result to standard output.
# The substitutions are order-dependent: later patterns rely on earlier ones
# having already removed or rewritten parts of the markup.
require 'open-uri'

years = %w{1997 1994 1989 1967 1952 1942 1934 1918 1830}
# Only the last entry of the list is processed.
year = years.last

# The page name is built from the year without its first character
# (e.g. "1830" -> const830.htm).
# URI.open is used instead of Kernel#open: open-uri stopped handling URLs
# through Kernel#open in Ruby 3.0.
file = URI.open("http://www0.parlamento.gub.uy/constituciones/const#{year.slice(1..-1)}.htm").read
t = file.encode('UTF-8', 'ISO-8859-1', universal_newline: true)

# Remove everything up to first h2 tag.
# Anchored single substitution: an unanchored gsub! would also delete the
# text between any later <h2 tags.
t.sub!(/\A.*?(?=<h2)/m, "")

# Remove everything after webbot comment
t.gsub!(/<!--webbot.*/im, "")

# Strip html comments
t.gsub!(/<!--(.*)-->\n/, '')

# Remove br tags
t.gsub!(/<br>\n?/, ' ')

# Set main header
t.gsub!(/<h2[^>]*>CONSTITUCION DE LA REPUBLICA<\/h2>/, '# CONSTITUCION DE LA REPUBLICA')

# Set explanatory text as 4th level header
t.gsub!(/<h4[^>]*>(CONSTITUCION .+)<\/h4>/, '#### \1')

# Remove line breaks in h4 tags.
# The block form replaces every newline in the heading, not just the last one,
# so multi-line headings can be matched by the single-line patterns below.
t.gsub!(/<h4[^>]*>[^<]*/) { |heading| heading.tr("\n", ' ') }

# Set sections as second level headers
t.gsub!(/<h4[^>]*>(SECCION .*)<\/h4>/, '## \1')

# Set chapters as third level headers
t.gsub!(/<h4[^>]*>(CAPITULO .*)<\/h4>/, '### \1')

# Set rest of h4 as 4th level headers
t.gsub!(/<h4[^>]*>([^<]*)<\/h4>/m, '#### \1')

# Replace hr with ---
t.gsub!(/<hr[^>]*>/, '---')

# Remove p tags.
# Two separate statements: gsub! returns nil when nothing was replaced, so
# chaining the calls would raise NoMethodError on a page without <p>.
t.gsub!('<p>', '')
t.gsub!('</p>', '')

# Handle article names (underlined anchors become bold text followed by "º").
# Non-greedy capture so two article anchors on one line are not merged.
t.gsub!(/<u><a[^>]*>(.*?)<\/a><\/u>º?/, '**\1º**')

# Remove article links, keeping the link text.
# Non-greedy capture so each link on a line is unwrapped individually.
t.gsub!(/<a[^>]*>(.*?)<\/a>/, '\1')

# Format tables as numbered lists
t.gsub!(/<\/?table.*>\n/, '')
t.gsub!(/ *<tr>/, '')
t.gsub!(/ *<\/tr>/, '')
# A cell holding only a number followed by "º" or "." becomes the list marker "N."
t.gsub!(/ *<td[^>]*>([0-9]+)(º|\.)<\/td>\n/, '\1.')
# Any remaining cell becomes its text, prefixed by a single space
t.gsub!(/ *<td[^>]*>([^<]*)<\/td>/m, ' \1')

puts t
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment