mstepniowski · September 19, 2010 11:20
diff --git a/fn2ics.py b/fn2ics.py
 # -*- coding: utf-8 -*-
 # Works on the `output.xml` generated by running `pdf2txt.py -o output.xml FILE`, 
 # where FILE is the programme of Science Festival in Warsaw in PDF.
 #
 # Required libraries:
 #
 #  * vobject <http://vobject.skyhouseconsulting.com/>
 #  * lxml <http://codespeak.net/lxml/>
 # 
 # Script `pdf2txt.py` is part of package PDFMiner <http://www.unixuser.org/~euske/python/pdfminer/index.html>.
 #
 # Copyright (c) 2010, Marek Stępniowski
 # All rights reserved.
 # 
 # Redistribution and use in source and binary forms, with or without modification, are permitted provided that
 # the following conditions are met:
 # 
 #  * Redistributions of source code must retain the above copyright notice, this list of conditions
 #    and the following disclaimer. 
 #  * Redistributions in binary form must reproduce the above copyright notice, this list of conditions 
 #    and the following disclaimer in the documentation and/or other materials provided with the distribution.
 #  * Neither the name of Marek Stępniowski nor the names of other contributors may be used to endorse
 #    or promote products derived from this software without specific prior written permission. 
 # 
 # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
 # WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A 
 # PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 
 # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 
 # (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; 
 # OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 
 # LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, 
 # EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 # 
 from lxml import etree
 from os import path
 import vobject
 import datetime
 import re
 import pprint


 # Fallback event length, used when a listing gives a start hour but no end hour.
 DEFAULT_DURATION = datetime.timedelta(hours=1)


 # ASCII replacements for lowercase Polish diacritics (text is lowercased first).
 POLISH_LETTERS = {
    u'ą': 'a', u'ć': 'c', u'ę': 'e', u'ł': u'l', u'ń': 'n',
    u'ó': 'o', u'ś': 's', u'ż': 'z', u'ź': 'z',
 }


 def dumb_slugify(text):
    """Return a lowercase, ASCII-only slug of `text`.

    Polish diacritics are transliterated through `POLISH_LETTERS`. Every
    remaining character that is not an ASCII letter, a digit, ``-`` or ``.``
    is replaced with ``-``, and runs of ``-`` are collapsed into one.
    """
    transliterated = ''.join(
        POLISH_LETTERS.get(letter, letter) for letter in text.lower())
    dashed = re.sub(r'[^a-zA-Z0-9.-]', '-', transliterated)
    return re.sub(r'-+', '-', dashed)


 def is_int(s):
    """Return True if `s` can be converted with `int()`, False otherwise.

    `None` (for example the `.text` of an empty XML element) and other
    non-numeric objects count as "not an integer" instead of raising.
    """
    try:
        int(s)
    except (TypeError, ValueError):
        return False
    return True


 def get_text(textline):
    """Return the concatenated, stripped text of all `text` children of `textline`.

    Children whose `.text` is `None` (empty elements) contribute nothing
    instead of breaking the join.
    """
    return ''.join(e.text or '' for e in textline.findall('text')).strip()


 def get_type(textline):
    """Classify `textline` by the font and size of its first `text` child.

    Returns one of:
      * `subtitle` - the font name contains "Italic" (ignorable heading)
      * `event` - size above 13 and the first character is a digit
      * `section` - size above 14.5
      * `default` - anything else, including a line with no `text` children

    A `place` line cannot be told apart from `default` by typography alone,
    so it is reported as `default`.
    """
    first_text_element = textline.find('text')
    if first_text_element is None:
        # Nothing to inspect: treat the line as ordinary text instead of
        # failing on the attribute lookups below.
        return 'default'

    first_text_element_size = float(first_text_element.attrib['size'])
    if 'Italic' in first_text_element.attrib['font']:
        return 'subtitle'
    # The order matters: a large line starting with a digit is an event
    # header, not a section name.
    if first_text_element_size > 13 and is_int(first_text_element.text):
        return 'event'
    if first_text_element_size > 14.5:
        return 'section'

    return 'default'


 def iterevents(textlines):
    """Yield one dict per event found in `textlines`.

    Each yielded dict has the keys `section_name`, `place_name`, `event_name`
    and `event_description`. Lines are classified with `get_type` and fed to
    a small state machine:

      * start / section_name / place_name - waiting for an event header;
        consecutive `section` lines are joined into the section name and
        consecutive `default` lines into the place name
      * event_name - collecting the event header; `default` lines are
        appended to it until a date is recognized
      * event_description - collecting the description lines

    `subtitle` lines are ignored in every state. An event is emitted when the
    next event or section starts, on an empty `default` line, or when the
    input ends.
    """
    section_name = ''
    place_name = ''
    event_name = ''
    event_description = ''
    date_found = False

    def current_event():
        # Snapshot of the event collected so far.
        return dict(section_name=section_name,
                    place_name=place_name,
                    event_name=event_name,
                    event_description=event_description)

    state = 'start'
    for line in textlines:
        line_type = get_type(line)
        line_text = get_text(line)

        if state in ('start', 'section_name', 'place_name'):
            if line_type == 'section':
                if state == 'section_name':
                    section_name += ' ' + line_text
                else:
                    section_name = line_text
                state = 'section_name'
            elif line_type == 'event':
                event_name = line_text
                event_description = ''
                state = 'event_name'
                # Only the strict (first) date pattern counts on a header line.
                date_found = (find_date(event_name, 1) is not None)
            elif line_type == 'default':
                if state == 'place_name':
                    place_name += ' ' + line_text
                else:
                    place_name = line_text
                state = 'place_name'

        elif state == 'event_name':
            if line_type == 'section':
                yield current_event()
                section_name = line_text
                state = 'section_name'
            elif line_type == 'event':
                yield current_event()
                event_name = line_text
                event_description = ''
                date_found = (find_date(event_name, 1) is not None)
            elif line_type == 'default':
                if not date_found:
                    # The header continues on this line; any date pattern
                    # is accepted here (a little less zealous).
                    event_name += ' ' + line_text
                    date_found = (find_date(event_name) is not None)
                elif not line_text:
                    yield current_event()
                    state = 'start'
                else:
                    event_description = line_text
                    state = 'event_description'

        elif state == 'event_description':
            if line_type == 'section':
                yield current_event()
                section_name = line_text
                state = 'section_name'
            elif line_type == 'event':
                yield current_event()
                event_name = line_text
                # Reset, otherwise an event without its own description
                # would inherit the previous event's text.
                event_description = ''
                state = 'event_name'
                date_found = (find_date(event_name, 1) is not None)
            elif line_type == 'default':
                if not line_text:
                    yield current_event()
                    state = 'start'
                else:
                    event_description += ' ' + line_text

    # The input may end in the middle of an event; do not lose it.
    if state in ('event_name', 'event_description'):
        yield current_event()



 # Tried in order: first a day with explicit hours, then a bare day.
 DATE_RE = [
    r'(?P<day>\d+) ?IX (?:od )?godz\.? ?(?P<hours>\d+([\.:]\d+)?(-\d+([\.:]\d+)?)?( ?[, ] ?(\d+([\.:]\d+)?(-\d+([\.:]\d+)?)?)?(?! ?\d? ?IX))*)',
    r'(?P<day>\d+) ?(IX|i (?=\d+))(?! godz.)',
 ]

 def find_date(text, pattern_number=None):
    """Return every date found in `text`, or `None` if there is none.

    The result is a list of the named groups (`day`, and `hours` where the
    pattern has it) of each successive match of the first `DATE_RE` pattern
    that matches at all. Only the first `pattern_number` patterns are tried;
    a falsy value means all of them.

    Forms that should be handled:
        18 IX godz. 18.30
        19 IX godz. 16:30
        22 IX,
        24 IX od godz. 18,
        24 IX od godz. 20,
        18 IX godz. 10-14
        18 IX godz. 11, 25 IX godz. 11
        25 IX godz. 10, 11, 12, 13, 26 IX godz. 10, 11, 12, 13
        25 IX godz. 13,14,15, 26 IX godz. 13,14 15
        25 IX
        18 i 19 IX
    """
    normalized = re.sub(r'\s+', ' ', text).strip()
    for candidate in DATE_RE[:pattern_number or None]:
        found = [m.groupdict() for m in re.finditer(candidate, normalized)]
        if found:
            return found
    return None


 # One hour entry: a start hour with an optional "-end" hour.
 HOUR_RE = r'(?P<start_hour>\d+([\.:]\d+)?)(?:-(?P<end_hour>\d+([\.:]\d+)?))?'

 def parse_hours(hours):
    """Turn the `hours` string captured by `find_date` into a list of entries.

    Every successive `HOUR_RE` match becomes a ``{'start': ..., 'end': ...}``
    dict built with `split_hour`; `end` is `None` when no end hour is given.
    `None` input gives an empty list.
    """
    if hours is None:
        return []
    return [
        {
            'start': split_hour(found.group('start_hour'), default=(0, 0)),
            'end': split_hour(found.group('end_hour'), default=None),
        }
        for found in re.finditer(HOUR_RE, hours)
    ]


 def split_hour(hour, default=None):
    """Convert an hour string into a tuple of ints, normally ``(hour, minute)``.

    Accepts ``'18'``, ``'18.30'`` and ``'16:30'``; a missing minute part
    becomes 0. Returns `default` when `hour` is `None`.

    The result is always a tuple, so it has the same type whether or not a
    minute part was present.
    """
    if hour is None:
        return default
    parts = hour.replace(':', '.').split('.')
    if len(parts) == 1:
        parts.append('0')
    return tuple(int(part) for part in parts)


 def date_from_dict(d):
    """Convert one match dict from `find_date` into calendar values.

    `d` must contain `day` and may contain `hours` (parsed with
    `parse_hours`). Year 2010 and month 9 are hard-coded.

    Returns a list holding either
      * a single `datetime.date`, when no hours were found, or
      * one ``{'start': datetime, 'end': datetime}`` dict per hour entry; an
        entry without an end hour ends `DEFAULT_DURATION` after its start.

    Hour entries that do not form a valid date/time are reported on stderr
    and skipped.
    """
    # Local import: module-level `sys` only exists when run as a script.
    import sys

    hours = parse_hours(d.get('hours'))
    if not hours:
        return [datetime.date(year=2010, month=9, day=int(d['day']))]

    result = []
    for hour in hours:
        try:
            day = int(d['day'])
            start_date = datetime.datetime(year=2010, month=9, day=day,
                                           hour=hour['start'][0], minute=hour['start'][1])
            if hour['end']:
                end_date = datetime.datetime(year=2010, month=9, day=day,
                                             hour=hour['end'][0], minute=hour['end'][1])
            else:
                end_date = start_date + DEFAULT_DURATION
        except ValueError:
            sys.stderr.write('Invalid value for hour: %r\n' % (hour,))
            continue
        result.append({
            'start': start_date,
            'end': end_date,
        })
    return result


 def parse_title(event_name):
    """Split an event header into its number, title and date part.

    The header is cut at the first em dash, en dash or hyphen. The part
    before it must start with the event number; the part after it is searched
    for dates with `find_date`.

    Returns a dict with the keys `number`, `name`, `datetime` (the
    `find_date` result for the part after the dash) and `rest` (that part
    itself, `''` when there is no dash).

    Raises:
        ValueError: if `event_name` does not start with a number.
    """
    parts = re.split(u'\u2014|\u2013|-', event_name, maxsplit=1)
    title_and_number = parts[0]
    rest = parts[1] if len(parts) == 2 else ''
    match = re.match(r'(\d+)\.?\s*(.*)', title_and_number)
    if match is None:
        raise ValueError('Event name does not start with a number: %r' % (event_name,))
    return {
        'number': match.group(1),
        'name': match.group(2),
        'datetime': find_date(rest),
        'rest': rest,
    }


 def main(input_file_name, output_file_name):
    """Convert the XML in `input_file_name` into iCalendar files.

    Writes one calendar with every event to `output_file_name` and, in the
    same directory, one calendar per section whose file name is the slug of
    ``<output base name>-<section name>`` plus the original extension.

    An event yields one VEVENT per date/hour found in its header; an event
    without a recognizable date is left out. Timed occurrences get DTSTART
    and DTEND, date-only occurrences get DTSTART only.

    Any exception raised while converting an event is reported on stderr
    together with the event and then re-raised.
    """
    # Local import: module-level `sys` only exists when run as a script.
    import sys

    doc = etree.parse(input_file_name)
    # './/textline' is the explicit relative form of '//textline'.
    lines = doc.findall('.//textline')
    calendar = vobject.iCalendar()

    sections = {}

    for event in iterevents(lines):
        try:
            title_parts = parse_title(event['event_name'])
            # Prefer dates after the dash; fall back to dates in the title.
            possible_dates = find_date(title_parts['rest'])
            if possible_dates is None:
                possible_dates = find_date(title_parts['name'])

            occurrences = []
            for d in possible_dates or []:
                occurrences.extend(date_from_dict(d))

            section_calendar = sections.get(event['section_name'])
            if section_calendar is None:
                section_calendar = sections[event['section_name']] = vobject.iCalendar()

            for c in (calendar, section_calendar):
                for t in occurrences:
                    vevent = c.add('vevent')
                    vevent.add('summary').value = title_parts['name']
                    vevent.add('description').value = event['event_name'] + '\n\n' + event['event_description']
                    vevent.add('location').value = event['place_name']
                    if isinstance(t, dict):
                        vevent.add('dtstart').value = t['start']
                        vevent.add('dtend').value = t['end']
                    else:
                        # Date-only occurrence: no end is recorded.
                        vevent.add('dtstart').value = t

        except Exception:
            sys.stderr.write('Error writing event: %r\n\n' % pprint.pformat(event))
            raise

    calendar.add('X-WR-CALNAME').value = 'Festiwal Naukowy'
    # Open the output only after parsing succeeded, so a failed run does not
    # truncate an existing file; `with` closes the file even on error.
    with open(output_file_name, 'w') as output_file:
        output_file.write(calendar.serialize())

    base_name, ext = path.splitext(output_file_name)
    # Slugify the file name only: slugifying the directory part would turn
    # path separators into dashes.
    directory, stem = path.split(base_name)
    for section_name, c in sections.items():
        c.add('X-WR-CALNAME').value = section_name.title()
        section_file_name = dumb_slugify('%s-%s' % (stem, section_name)) + ext
        with open(path.join(directory, section_file_name), 'w') as section_file:
            section_file.write(c.serialize())


 if __name__ == '__main__':
    import sys
    if len(sys.argv) < 3:
        # Stop here: continuing would fail with an IndexError on sys.argv.
        sys.stderr.write("This script requires an INPUT_FILE and OUTPUT_FILE arguments!\n")
        sys.exit(1)
    main(sys.argv[1], sys.argv[2])
	# -*- coding: utf-8 -*-
	# Works on the `output.xml` generated by running `pdf2txt.py -o output.xml FILE`,
	# where FILE is the programme of Science Festival in Warsaw in PDF.
	#
	# Required libraries:
	#
	# * vobject <http://vobject.skyhouseconsulting.com/>
	# * lxml <http://codespeak.net/lxml/>
	#
	# Script `pdf2txt.py` is part of package PDFMiner <http://www.unixuser.org/~euske/python/pdfminer/index.html>.
	#
	# Copyright (c) 2010, Marek Stępniowski
	# All rights reserved.
	#
	# Redistribution and use in source and binary forms, with or without modification, are permitted provided that
	# the following conditions are met:
	#
	# * Redistributions of source code must retain the above copyright notice, this list of conditions
	# and the following disclaimer.
	# * Redistributions in binary form must reproduce the above copyright notice, this list of conditions
	# and the following disclaimer in the documentation and/or other materials provided with the distribution.
	# * Neither the name of Marek Stępniowski nor the names of other contributors may be used to endorse
	# or promote products derived from this software without specific prior written permission.
	#
	# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
	# WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
	# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
	# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
	# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
	# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
	# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
	# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
	#
	from lxml import etree
	from os import path
	import vobject
	import datetime
	import re
	import pprint


	# Fallback event length, used when a listing gives a start hour but no end hour.
	DEFAULT_DURATION = datetime.timedelta(hours=1)


	# ASCII replacements for lowercase Polish diacritics (text is lowercased first).
	POLISH_LETTERS = {
	    u'ą': 'a', u'ć': 'c', u'ę': 'e', u'ł': u'l', u'ń': 'n',
	    u'ó': 'o', u'ś': 's', u'ż': 'z', u'ź': 'z',
	}


	def dumb_slugify(text):
	    """Return a lowercase, ASCII-only slug of `text`.

	    Polish diacritics are transliterated through `POLISH_LETTERS`. Every
	    remaining character that is not an ASCII letter, a digit, ``-`` or ``.``
	    is replaced with ``-``, and runs of ``-`` are collapsed into one.
	    """
	    transliterated = ''.join(
	        POLISH_LETTERS.get(letter, letter) for letter in text.lower())
	    dashed = re.sub(r'[^a-zA-Z0-9.-]', '-', transliterated)
	    return re.sub(r'-+', '-', dashed)


	def is_int(s):
	    """Return True if `s` can be converted with `int()`, False otherwise.

	    `None` (for example the `.text` of an empty XML element) and other
	    non-numeric objects count as "not an integer" instead of raising.
	    """
	    try:
	        int(s)
	    except (TypeError, ValueError):
	        return False
	    return True


	def get_text(textline):
	    """Return the concatenated, stripped text of all `text` children of `textline`.

	    Children whose `.text` is `None` (empty elements) contribute nothing
	    instead of breaking the join.
	    """
	    return ''.join(e.text or '' for e in textline.findall('text')).strip()


	def get_type(textline):
	    """Classify `textline` by the font and size of its first `text` child.

	    Returns one of:
	      * `subtitle` - the font name contains "Italic" (ignorable heading)
	      * `event` - size above 13 and the first character is a digit
	      * `section` - size above 14.5
	      * `default` - anything else, including a line with no `text` children

	    A `place` line cannot be told apart from `default` by typography alone,
	    so it is reported as `default`.
	    """
	    first_text_element = textline.find('text')
	    if first_text_element is None:
	        # Nothing to inspect: treat the line as ordinary text instead of
	        # failing on the attribute lookups below.
	        return 'default'

	    first_text_element_size = float(first_text_element.attrib['size'])
	    if 'Italic' in first_text_element.attrib['font']:
	        return 'subtitle'
	    # The order matters: a large line starting with a digit is an event
	    # header, not a section name.
	    if first_text_element_size > 13 and is_int(first_text_element.text):
	        return 'event'
	    if first_text_element_size > 14.5:
	        return 'section'

	    return 'default'


	def iterevents(textlines):
	    """Yield one dict per event found in `textlines`.

	    Each yielded dict has the keys `section_name`, `place_name`, `event_name`
	    and `event_description`. Lines are classified with `get_type` and fed to
	    a small state machine:

	      * start / section_name / place_name - waiting for an event header;
	        consecutive `section` lines are joined into the section name and
	        consecutive `default` lines into the place name
	      * event_name - collecting the event header; `default` lines are
	        appended to it until a date is recognized
	      * event_description - collecting the description lines

	    `subtitle` lines are ignored in every state. An event is emitted when the
	    next event or section starts, on an empty `default` line, or when the
	    input ends.
	    """
	    section_name = ''
	    place_name = ''
	    event_name = ''
	    event_description = ''
	    date_found = False

	    def current_event():
	        # Snapshot of the event collected so far.
	        return dict(section_name=section_name,
	                    place_name=place_name,
	                    event_name=event_name,
	                    event_description=event_description)

	    state = 'start'
	    for line in textlines:
	        line_type = get_type(line)
	        line_text = get_text(line)

	        if state in ('start', 'section_name', 'place_name'):
	            if line_type == 'section':
	                if state == 'section_name':
	                    section_name += ' ' + line_text
	                else:
	                    section_name = line_text
	                state = 'section_name'
	            elif line_type == 'event':
	                event_name = line_text
	                event_description = ''
	                state = 'event_name'
	                # Only the strict (first) date pattern counts on a header line.
	                date_found = (find_date(event_name, 1) is not None)
	            elif line_type == 'default':
	                if state == 'place_name':
	                    place_name += ' ' + line_text
	                else:
	                    place_name = line_text
	                state = 'place_name'

	        elif state == 'event_name':
	            if line_type == 'section':
	                yield current_event()
	                section_name = line_text
	                state = 'section_name'
	            elif line_type == 'event':
	                yield current_event()
	                event_name = line_text
	                event_description = ''
	                date_found = (find_date(event_name, 1) is not None)
	            elif line_type == 'default':
	                if not date_found:
	                    # The header continues on this line; any date pattern
	                    # is accepted here (a little less zealous).
	                    event_name += ' ' + line_text
	                    date_found = (find_date(event_name) is not None)
	                elif not line_text:
	                    yield current_event()
	                    state = 'start'
	                else:
	                    event_description = line_text
	                    state = 'event_description'

	        elif state == 'event_description':
	            if line_type == 'section':
	                yield current_event()
	                section_name = line_text
	                state = 'section_name'
	            elif line_type == 'event':
	                yield current_event()
	                event_name = line_text
	                # Reset, otherwise an event without its own description
	                # would inherit the previous event's text.
	                event_description = ''
	                state = 'event_name'
	                date_found = (find_date(event_name, 1) is not None)
	            elif line_type == 'default':
	                if not line_text:
	                    yield current_event()
	                    state = 'start'
	                else:
	                    event_description += ' ' + line_text

	    # The input may end in the middle of an event; do not lose it.
	    if state in ('event_name', 'event_description'):
	        yield current_event()



	# Tried in order: first a day with explicit hours, then a bare day.
	DATE_RE = [
	    r'(?P<day>\d+) ?IX (?:od )?godz\.? ?(?P<hours>\d+([\.:]\d+)?(-\d+([\.:]\d+)?)?( ?[, ] ?(\d+([\.:]\d+)?(-\d+([\.:]\d+)?)?)?(?! ?\d? ?IX))*)',
	    r'(?P<day>\d+) ?(IX|i (?=\d+))(?! godz.)',
	]

	def find_date(text, pattern_number=None):
	    """Return every date found in `text`, or `None` if there is none.

	    The result is a list of the named groups (`day`, and `hours` where the
	    pattern has it) of each successive match of the first `DATE_RE` pattern
	    that matches at all. Only the first `pattern_number` patterns are tried;
	    a falsy value means all of them.

	    Forms that should be handled:
	        18 IX godz. 18.30
	        19 IX godz. 16:30
	        22 IX,
	        24 IX od godz. 18,
	        24 IX od godz. 20,
	        18 IX godz. 10-14
	        18 IX godz. 11, 25 IX godz. 11
	        25 IX godz. 10, 11, 12, 13, 26 IX godz. 10, 11, 12, 13
	        25 IX godz. 13,14,15, 26 IX godz. 13,14 15
	        25 IX
	        18 i 19 IX
	    """
	    text = re.sub(r'\s+', ' ', text).strip()
	    patterns = DATE_RE[:pattern_number] if pattern_number else DATE_RE
	    for pattern in patterns:
	        result = [match.groupdict() for match in re.finditer(pattern, text)]
	        if result:
	            return result
	    return None


	# One hour entry: a start hour with an optional "-end" hour.
	HOUR_RE = r'(?P<start_hour>\d+([\.:]\d+)?)(?:-(?P<end_hour>\d+([\.:]\d+)?))?'

	def parse_hours(hours):
	    """Turn the `hours` string captured by `find_date` into a list of entries.

	    Every successive `HOUR_RE` match becomes a ``{'start': ..., 'end': ...}``
	    dict built with `split_hour`; `end` is `None` when no end hour is given.
	    `None` input gives an empty list.
	    """
	    if hours is None:
	        return []
	    return [
	        {
	            'start': split_hour(match.group('start_hour'), default=(0, 0)),
	            'end': split_hour(match.group('end_hour'), default=None),
	        }
	        for match in re.finditer(HOUR_RE, hours)
	    ]


	def split_hour(hour, default=None):
	    """Convert an hour string into a tuple of ints, normally ``(hour, minute)``.

	    Accepts ``'18'``, ``'18.30'`` and ``'16:30'``; a missing minute part
	    becomes 0. Returns `default` when `hour` is `None`.

	    The result is always a tuple, so it has the same type whether or not a
	    minute part was present.
	    """
	    if hour is None:
	        return default
	    parts = hour.replace(':', '.').split('.')
	    if len(parts) == 1:
	        parts.append('0')
	    return tuple(int(part) for part in parts)


	def date_from_dict(d):
	    """Convert one match dict from `find_date` into calendar values.

	    `d` must contain `day` and may contain `hours` (parsed with
	    `parse_hours`). Year 2010 and month 9 are hard-coded.

	    Returns a list holding either
	      * a single `datetime.date`, when no hours were found, or
	      * one ``{'start': datetime, 'end': datetime}`` dict per hour entry; an
	        entry without an end hour ends `DEFAULT_DURATION` after its start.

	    Hour entries that do not form a valid date/time are reported on stderr
	    and skipped.
	    """
	    # Local import: module-level `sys` only exists when run as a script.
	    import sys

	    hours = parse_hours(d.get('hours'))
	    if not hours:
	        return [datetime.date(year=2010, month=9, day=int(d['day']))]

	    result = []
	    for hour in hours:
	        try:
	            day = int(d['day'])
	            start_date = datetime.datetime(year=2010, month=9, day=day,
	                                           hour=hour['start'][0], minute=hour['start'][1])
	            if hour['end']:
	                end_date = datetime.datetime(year=2010, month=9, day=day,
	                                             hour=hour['end'][0], minute=hour['end'][1])
	            else:
	                end_date = start_date + DEFAULT_DURATION
	        except ValueError:
	            sys.stderr.write('Invalid value for hour: %r\n' % (hour,))
	            continue
	        result.append({
	            'start': start_date,
	            'end': end_date,
	        })
	    return result


	def parse_title(event_name):
	    """Split an event header into its number, title and date part.

	    The header is cut at the first em dash, en dash or hyphen. The part
	    before it must start with the event number; the part after it is searched
	    for dates with `find_date`.

	    Returns a dict with the keys `number`, `name`, `datetime` (the
	    `find_date` result for the part after the dash) and `rest` (that part
	    itself, `''` when there is no dash).

	    Raises:
	        ValueError: if `event_name` does not start with a number.
	    """
	    parts = re.split(u'\u2014|\u2013|-', event_name, maxsplit=1)
	    title_and_number = parts[0]
	    rest = parts[1] if len(parts) == 2 else ''
	    match = re.match(r'(\d+)\.?\s*(.*)', title_and_number)
	    if match is None:
	        raise ValueError('Event name does not start with a number: %r' % (event_name,))
	    return {
	        'number': match.group(1),
	        'name': match.group(2),
	        'datetime': find_date(rest),
	        'rest': rest,
	    }


	def main(input_file_name, output_file_name):
	    """Convert the XML in `input_file_name` into iCalendar files.

	    Writes one calendar with every event to `output_file_name` and, in the
	    same directory, one calendar per section whose file name is the slug of
	    ``<output base name>-<section name>`` plus the original extension.

	    An event yields one VEVENT per date/hour found in its header; an event
	    without a recognizable date is left out. Timed occurrences get DTSTART
	    and DTEND, date-only occurrences get DTSTART only.

	    Any exception raised while converting an event is reported on stderr
	    together with the event and then re-raised.
	    """
	    # Local import: module-level `sys` only exists when run as a script.
	    import sys

	    doc = etree.parse(input_file_name)
	    # './/textline' is the explicit relative form of '//textline'.
	    lines = doc.findall('.//textline')
	    calendar = vobject.iCalendar()

	    sections = {}

	    for event in iterevents(lines):
	        try:
	            title_parts = parse_title(event['event_name'])
	            # Prefer dates after the dash; fall back to dates in the title.
	            possible_dates = find_date(title_parts['rest'])
	            if possible_dates is None:
	                possible_dates = find_date(title_parts['name'])

	            occurrences = []
	            for d in possible_dates or []:
	                occurrences.extend(date_from_dict(d))

	            section_calendar = sections.get(event['section_name'])
	            if section_calendar is None:
	                section_calendar = sections[event['section_name']] = vobject.iCalendar()

	            for c in (calendar, section_calendar):
	                for t in occurrences:
	                    vevent = c.add('vevent')
	                    vevent.add('summary').value = title_parts['name']
	                    vevent.add('description').value = event['event_name'] + '\n\n' + event['event_description']
	                    vevent.add('location').value = event['place_name']
	                    if isinstance(t, dict):
	                        vevent.add('dtstart').value = t['start']
	                        vevent.add('dtend').value = t['end']
	                    else:
	                        # Date-only occurrence: no end is recorded.
	                        vevent.add('dtstart').value = t

	        except Exception:
	            sys.stderr.write('Error writing event: %r\n\n' % pprint.pformat(event))
	            raise

	    calendar.add('X-WR-CALNAME').value = 'Festiwal Naukowy'
	    # Open the output only after parsing succeeded, so a failed run does not
	    # truncate an existing file; `with` closes the file even on error.
	    with open(output_file_name, 'w') as output_file:
	        output_file.write(calendar.serialize())

	    base_name, ext = path.splitext(output_file_name)
	    # Slugify the file name only: slugifying the directory part would turn
	    # path separators into dashes.
	    directory, stem = path.split(base_name)
	    for section_name, c in sections.items():
	        c.add('X-WR-CALNAME').value = section_name.title()
	        section_file_name = dumb_slugify('%s-%s' % (stem, section_name)) + ext
	        with open(path.join(directory, section_file_name), 'w') as section_file:
	            section_file.write(c.serialize())


	if __name__ == '__main__':
	    import sys
	    if len(sys.argv) < 3:
	        # Stop here: continuing would fail with an IndexError on sys.argv.
	        sys.stderr.write("This script requires an INPUT_FILE and OUTPUT_FILE arguments!\n")
	        sys.exit(1)
	    main(sys.argv[1], sys.argv[2])