Skip to content

Instantly share code, notes, and snippets.

@mstepniowski
Created September 19, 2010 11:20
Show Gist options
  • Save mstepniowski/586683 to your computer and use it in GitHub Desktop.
Save mstepniowski/586683 to your computer and use it in GitHub Desktop.
# -*- coding: utf-8 -*-
# Works on the `output.xml` generated by running `pdf2txt.py -o output.xml FILE`,
# where FILE is the programme of Science Festival in Warsaw in PDF.
#
# Required libraries:
#
# * vobject <http://vobject.skyhouseconsulting.com/>
# * lxml <http://codespeak.net/lxml/>
#
# Script `pdf2txt.py` is part of package PDFMiner <http://www.unixuser.org/~euske/python/pdfminer/index.html>.
#
# Copyright (c) 2010, Marek Stępniowski
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without modification, are permitted provided that
# the following conditions are met:
#
# * Redistributions of source code must retain the above copyright notice, this list of conditions
# and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright notice, this list of conditions
# and the following disclaimer in the documentation and/or other materials provided with the distribution.
# * Neither the name of Marek Stępniowski nor the names of other contributors may be used to endorse
# or promote products derived from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
# WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
import datetime
import pprint
import re
import sys
from os import path

import vobject
from lxml import etree
# Length given to an event whose listing has a start hour but no end hour
# (added to the start time in `date_from_dict`).
DEFAULT_DURATION = datetime.timedelta(hours=1)
# ASCII replacements for the lowercase Polish diacritics.  `dumb_slugify`
# lowercases its input before the lookup, so uppercase forms need no entry.
POLISH_LETTERS = {
    u'ą': 'a',
    u'ć': 'c',
    u'ę': 'e',
    u'ł': 'l',
    u'ń': 'n',
    u'ó': 'o',
    u'ś': 's',
    u'ż': 'z',
    u'ź': 'z',
}


def dumb_slugify(text):
    """Return a lowercase slug of `text` suitable for a file name.

    Polish diacritics are replaced through `POLISH_LETTERS`; every remaining
    character other than an ASCII letter, a digit, `-` or `.` becomes `-`,
    and runs of `-` are collapsed into a single one.
    """
    transliterated = ''.join(
        POLISH_LETTERS.get(letter, letter) for letter in text.lower()
    )
    # Inside the character class both the trailing `-` and `.` are literals.
    dashed = re.sub(r'[^a-zA-Z0-9-.]', '-', transliterated)
    return re.sub(r'-+', '-', dashed)
def is_int(s):
    """Return True if `int(s)` succeeds, False otherwise.

    `None` (the `.text` of an empty XML element) and other values `int()`
    rejects with TypeError count as "not an integer" instead of raising.
    """
    try:
        int(s)
    except (TypeError, ValueError):
        return False
    return True
def get_text(textline):
    """Return the text of the passed `textline`.

    The text of every direct `text` child is concatenated in document order
    and surrounding whitespace is stripped.  A child without any text (its
    `.text` is None) contributes an empty string.
    """
    return ''.join(e.text or '' for e in textline.findall('text')).strip()
def get_type(textline):
    """Classify the passed `textline` by its first `text` child.

    Returned values:

    * `subtitle` - the font name contains "Italic"; an ignorable heading
    * `event` - size above 13 and the first character is an integer;
      basic information about the event
    * `section` - size above 14.5; name of the section
    * `default` - anything else

    A `place` line (the place where the following events happen) cannot be
    told apart from `default`, so it is reported as `default`.
    """
    head = textline.find('text')
    size = float(head.attrib['size'])
    font = head.attrib['font']

    if 'Italic' in font:
        return 'subtitle'

    starts_with_number = size > 13 and is_int(head.text)
    if starts_with_number:
        return 'event'

    return 'section' if size > 14.5 else 'default'
def iterevents(textlines):
    """Yield a dict for every event found in `textlines`.

    Each line is classified with `get_type` and read with `get_text`; a small
    state machine then groups consecutive lines.  Every yielded dict has the
    keys `section_name`, `place_name`, `event_name` and `event_description`.

    States:

    * `start` - nothing collected yet, or the previous event just ended
    * `section_name` - collecting a (possibly multi-line) section name
    * `place_name` - collecting a (possibly multi-line) place name
    * `event_name` - collecting the event heading until its date is found
    * `event_description` - collecting the event description

    Lines of type `subtitle` are ignored in every state.
    """
    section_name = ''
    place_name = ''
    event_name = ''
    event_description = ''
    date_found = False

    def current_event():
        # Snapshot of what has been collected for the event in progress.
        return dict(section_name=section_name,
                    place_name=place_name,
                    event_name=event_name,
                    event_description=event_description)

    state = 'start'
    for line in textlines:
        line_type = get_type(line)
        line_text = get_text(line)

        if state in ('start', 'section_name', 'place_name'):
            if line_type == 'section':
                # Consecutive section lines form one multi-line name.
                if state == 'section_name':
                    section_name += ' ' + line_text
                else:
                    section_name = line_text
                state = 'section_name'
            elif line_type == 'event':
                event_name = line_text
                event_description = ''
                state = 'event_name'
                # Only the strict "day + hours" pattern counts on the
                # heading line itself.
                date_found = (find_date(event_name, 1) is not None)
            elif line_type == 'default':
                # Consecutive default lines form one multi-line place name.
                if state == 'place_name':
                    place_name += ' ' + line_text
                else:
                    place_name = line_text
                state = 'place_name'

        elif state == 'event_name':
            if line_type == 'section':
                yield current_event()
                section_name = line_text
                state = 'section_name'
            elif line_type == 'event':
                yield current_event()
                event_name = line_text
                event_description = ''
                date_found = (find_date(event_name, 1) is not None)
            elif line_type == 'default':
                if not date_found:
                    # The heading continues until some date shows up.
                    event_name += ' ' + line_text
                    date_found = (find_date(event_name) is not None)  # A little less zealous
                    state = 'event_name'
                elif not line_text:
                    # A blank line ends an event that has no description.
                    yield current_event()
                    state = 'start'
                else:
                    event_description = line_text
                    state = 'event_description'

        elif state == 'event_description':
            if line_type == 'section':
                yield current_event()
                section_name = line_text
                state = 'section_name'
            elif line_type == 'event':
                yield current_event()
                state = 'event_name'
                event_name = line_text
                # Start the new event with a clean description; otherwise an
                # event without one would inherit the previous event's text.
                event_description = ''
                date_found = (find_date(event_name, 1) is not None)
            elif line_type == 'default':
                if not line_text:
                    yield current_event()
                    state = 'start'
                else:
                    event_description += ' ' + line_text

    # The input may end without a blank line or a new heading; do not drop
    # the event that was still being collected.
    if state in ('event_name', 'event_description'):
        yield current_event()
# Date patterns, most specific first:
#   [0] a day followed by "IX ... godz." and one or more hours (`day`, `hours`)
#   [1] a bare day, "<day> IX" or the first half of "<day> i <day> IX" (`day`)
DATE_RE = [
    r'(?P<day>\d+) ?IX (?:od )?godz\.? ?(?P<hours>\d+([\.:]\d+)?(-\d+([\.:]\d+)?)?( ?[, ] ?(\d+([\.:]\d+)?(-\d+([\.:]\d+)?)?)?(?! ?\d? ?IX))*)',
    r'(?P<day>\d+) ?(IX|i (?=\d+))(?! godz.)',
]


def find_date(text, pattern_number=None):
    """Return the dates mentioned in `text`, or None when there are none.

    Whitespace runs in `text` are collapsed first.  The patterns of `DATE_RE`
    are tried in order (only the first `pattern_number` of them when that
    argument is truthy); the first pattern that matches at all is applied
    repeatedly to the remaining text and the named groups of every match are
    returned as a list of dicts.

    Cases that should be handled:

        18 IX godz. 18.30
        19 IX godz. 16:30
        22 IX,
        24 IX od godz. 18,
        24 IX od godz. 20,
        18 IX godz. 10-14
        18 IX godz. 11, 25 IX godz. 11
        25 IX godz. 10, 11, 12, 13, 26 IX godz. 10, 11, 12, 13
        25 IX godz. 13,14,15, 26 IX godz. 13,14 15
        25 IX
        18 i 19 IX
    """
    remaining = re.sub(r'\s+', ' ', text).strip()
    candidates = DATE_RE[:pattern_number] if pattern_number else DATE_RE

    for pattern in candidates:
        found = re.search(pattern, remaining)
        if found is None:
            continue
        dates = []
        while found is not None:
            dates.append(found.groupdict())
            # Continue searching right after the previous match.
            remaining = remaining[found.end():]
            found = re.search(pattern, remaining)
        return dates

    return None
# One hour entry: a start hour with an optional "-<end hour>" part; minutes
# may follow either hour after "." or ":".
HOUR_RE = r'(?P<start_hour>\d+([\.:]\d+)?)(?:-(?P<end_hour>\d+([\.:]\d+)?))?'


def parse_hours(hours):
    """Parse the `hours` string captured by `find_date`.

    Returns a list with one dict per hour entry found in the string, each
    holding `start` and `end` as produced by `split_hour`; `end` is None when
    the entry has no end hour.  `None` input yields an empty list.
    """
    if hours is None:
        return []
    return [
        {
            'start': split_hour(found.group('start_hour'), default=(0, 0)),
            'end': split_hour(found.group('end_hour'), default=None),
        }
        for found in re.finditer(HOUR_RE, hours)
    ]
def split_hour(hour, default=None):
    """Split an hour string such as "18", "18.30" or "16:30" into integers.

    Returns `default` when `hour` is None.  Otherwise returns a tuple: the
    parts around the first kind of separator present ("." is checked before
    ":"), or `(hour, 0)` when there is no separator.  The result is always a
    tuple so every caller sees the same type.

    Raises ValueError if any part is not an integer.
    """
    if hour is None:
        return default
    for separator in ('.', ':'):
        if separator in hour:
            return tuple(int(part) for part in hour.split(separator))
    return (int(hour), 0)
def date_from_dict(d, year=2010, month=9):
    """Convert one dict returned by `find_date` into calendar dates.

    `d` must have a `day` key and may have an `hours` key.  `year` and
    `month` default to September 2010.

    Returns a list:

    * without hours - a single `datetime.date` (an all-day entry)
    * with hours - one dict per hour entry with `start` and `end`
      `datetime.datetime` values; a missing end hour is replaced by
      `start + DEFAULT_DURATION`

    Hour entries that do not form a valid date/time are reported on stderr
    and skipped.
    """
    hours = parse_hours(d.get('hours'))
    day = int(d['day'])
    if not hours:
        return [datetime.date(year=year, month=month, day=day)]

    result = []
    for hour in hours:
        try:
            start_date = datetime.datetime(year=year, month=month, day=day,
                                           hour=hour['start'][0],
                                           minute=hour['start'][1])
            if hour['end']:
                end_date = datetime.datetime(year=year, month=month, day=day,
                                             hour=hour['end'][0],
                                             minute=hour['end'][1])
            else:
                end_date = start_date + DEFAULT_DURATION
        except ValueError:
            # E.g. hour 24 or minute 60: skip this entry, keep the others.
            sys.stderr.write('Invalid value for hour: %r\n' % (hour,))
            continue
        result.append({
            'start': start_date,
            'end': end_date,
        })
    return result
def parse_title(event_name):
    """Extract the event title from the passed `event_name`.

    `event_name` is split at the first em dash, en dash or hyphen.  The part
    before it is expected to look like "<number>. <title>"; the part after
    it is kept as `rest`.

    Returns a dict with the keys:

    * `number` - leading event number as a string, or None if there is none
    * `name` - the title without the number
    * `datetime` - result of `find_date(rest)`
    * `rest` - everything after the first dash ('' if there is no dash)

    A name without a leading number is reported on stderr and returned
    whole as `name`.
    """
    parts = re.split(u'\u2014|\u2013|-', event_name, maxsplit=1)
    title_and_number = parts[0]
    rest = parts[1] if len(parts) == 2 else ''

    match = re.match(r'(\d+)\.?\s*(.*)', title_and_number)
    if match is None:
        sys.stderr.write('Event name has no leading number: %r\n' % (event_name,))
        number, name = None, title_and_number
    else:
        number, name = match.group(1), match.group(2)

    return {
        'number': number,
        'name': name,
        'datetime': find_date(rest),
        'rest': rest,
    }
def main(input_file_name, output_file_name):
    """Convert the PDFMiner XML in `input_file_name` into iCalendar files.

    Every event yielded by `iterevents` gets one VEVENT per date/time found
    in its heading, added both to the main calendar and to the calendar of
    its section.  The main calendar is written to `output_file_name`; each
    section calendar is written next to it as
    `<slug of "name-section">` + the same extension.

    If an event cannot be converted, it is dumped to stderr and the
    exception is re-raised; no file is written in that case.
    """
    doc = etree.parse(input_file_name)
    lines = doc.findall('//textline')
    calendar = vobject.iCalendar()
    sections = {}

    for event in iterevents(lines):
        try:
            title_parts = parse_title(event['event_name'])
            # The date normally follows the dash; fall back to the title.
            possible_dates = find_date(title_parts['rest'])
            if possible_dates is None:
                possible_dates = find_date(title_parts['name'])
            occurrences = [occurrence
                           for d in possible_dates or []
                           for occurrence in date_from_dict(d)]

            section_calendar = sections.get(event['section_name'])
            if section_calendar is None:
                section_calendar = sections[event['section_name']] = vobject.iCalendar()

            for c in (calendar, section_calendar):
                for occurrence in occurrences:
                    vevent = c.add('vevent')
                    vevent.add('summary').value = title_parts['name']
                    vevent.add('description').value = event['event_name'] + '\n\n' + event['event_description']
                    vevent.add('location').value = event['place_name']
                    if isinstance(occurrence, dict):
                        vevent.add('dtstart').value = occurrence['start']
                        vevent.add('dtend').value = occurrence['end']
                    else:
                        # A bare date: entry without hours, start only.
                        vevent.add('dtstart').value = occurrence
        except Exception:
            sys.stderr.write('Error writing event: %r\n\n' % pprint.pformat(event))
            raise

    calendar.add('X-WR-CALNAME').value = 'Festiwal Naukowy'
    # Opened only now, so a failed run neither truncates nor leaks the file.
    with open(output_file_name, 'w') as output_file:
        output_file.write(calendar.serialize())

    base_name, ext = path.splitext(output_file_name)
    # Slugify the file name only; slugifying the directory part would turn
    # its separators into dashes and put the file in the wrong place.
    directory, stem = path.split(base_name)
    for section_name, c in sections.items():
        c.add('X-WR-CALNAME').value = section_name.title()
        section_file_name = dumb_slugify('%s-%s' % (stem, section_name)) + ext
        with open(path.join(directory, section_file_name), 'w') as section_file:
            section_file.write(c.serialize())
if __name__ == '__main__':
    import sys
    if len(sys.argv) < 3:
        # Stop here: calling main() without both arguments would only fail
        # with an IndexError.  sys.exit() prints the message to stderr and
        # exits with status 1.
        sys.exit("This script requires an INPUT_FILE and OUTPUT_FILE arguments!")
    main(sys.argv[1], sys.argv[2])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment