Created
June 7, 2015 14:25
-
-
Save lumbric/b9d7e5dee5a4c8e2d15c to your computer and use it in GitHub Desktop.
A very fast web scraping script to get a CSV with Volxkino events. Importable to Google calendar
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""A very fast web scraping script to get a CSV with Volxkino events. | |
Importable to Google calendar. | |
Script written 2015. | |
[email protected] | |
Reuqirments: | |
- BetautifulSoup 4: https://pypi.python.org/pypi/beautifulsoup4 | |
""" | |
import csv | |
import codecs | |
import cStringIO | |
import itertools | |
import locale | |
from datetime import datetime | |
from bs4 import BeautifulSoup | |
import requests | |
URL = "http://volxkino.at/events/bevorstehend/page/{}/" | |
start_date = [] | |
start_time = [] | |
location = [] | |
title = [] | |
link = [] | |
description = [] | |
class CsvUnicodeWriter(object):
    """
    A CSV writer which will write rows to CSV file "f",
    which is encoded in the given encoding.

    Python 2 recipe from the stdlib csv documentation: the csv module
    cannot write unicode directly, so each row is encoded to UTF-8,
    passed through csv.writer into an in-memory queue, then decoded and
    re-encoded to the target encoding before being written to the real
    stream.
    """
    def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds):
        # Redirect output to a queue
        self.queue = cStringIO.StringIO()
        self.writer = csv.writer(self.queue, dialect=dialect, **kwds)
        # Target stream the re-encoded bytes are finally written to.
        self.stream = f
        # Incremental encoder keeps state across writes, so multi-byte
        # sequences are handled correctly between writerow() calls.
        self.encoder = codecs.getincrementalencoder(encoding)()
    def writerow(self, row):
        # Assumes every cell is a unicode string (has .encode()).
        self.writer.writerow([s.encode("utf-8") for s in row])
        # Fetch UTF-8 output from the queue ...
        data = self.queue.getvalue()
        data = data.decode("utf-8")
        # ... and reencode it into the target encoding
        data = self.encoder.encode(data)
        # write to the target stream
        self.stream.write(data)
        # empty queue
        # NOTE(review): Python 2 cStringIO.truncate(0) also resets the
        # file position, unlike io.StringIO — this relies on that.
        self.queue.truncate(0)
    def writerows(self, rows):
        # Write each row in turn through writerow().
        for row in rows:
            self.writerow(row)
# German locale is required so strptime("%B") can parse month names
# like "Juni" in the scraped dates.
locale.setlocale(locale.LC_TIME, ("de_AT.UTF-8", "de_AT.UTF-8"))
# Walk the paginated event listing until an empty page is returned.
for page_num in itertools.count(start=1):
    page = requests.get(URL.format(page_num))
    # Explicit parser avoids bs4's "no parser specified" warning and
    # keeps the parser choice consistent across environments.
    soup = BeautifulSoup(page.text, "html.parser")
    # Select events once per page instead of twice (original queried
    # the same selector for the emptiness check and the loop).
    events = soup.select('div.tribe-events-event')
    if not events:
        # Past the last page — stop paginating.
        # print(...) works identically under Python 2 and 3 here.
        print("page {} empty".format(page_num))
        break
    for event in events:
        # Query the meta table cells once; cell [1] holds the start
        # date/time (e.g. "7. Juni 2015 21:00"), the last cell the venue.
        meta_cells = event.select('.tribe-events-event-list-meta td')
        start_datetime = meta_cells[1].get_text().strip()
        parts = start_datetime.split()
        # Everything but the last token is the date, e.g. "7. Juni 2015".
        dt = " ".join(parts[:-1])
        start_date.append(datetime.strptime(dt, "%d. %B %Y"
                                            ).strftime("%Y-%m-%d"))
        # The final token is the start time, e.g. "21:00".
        start_time.append(" ".join(parts[-1:]))
        title.append(event.select('h2.entry-title')[0].get_text())
        location.append(meta_cells[-1].get_text().strip())
        # no link field for google calendar, prepend to description
        link = event.select('h2.entry-title a')[0].attrs['href']
        desc = event.select('div.entry-content')[0].get_text()
        description.append("%s\n\n%s" % (link, desc))
# possible fields:
# Subject, Start Date, Start Time, End Date, End Time, All Day Event,
# Reminder On/Off, Reminder Date, Reminder Time, Meeting Organizer,
# Description, Location, Private
# Context manager ensures the CSV file is flushed and closed even if
# writing fails (the original never closed the handle).
with open("volxkino_calendar.csv", "w") as csv_file:
    csv_writer = CsvUnicodeWriter(csv_file, quotechar='"', delimiter=',',
                                  quoting=csv.QUOTE_ALL)
    # Header row matches the Google Calendar CSV import field names.
    csv_writer.writerow(["Start Date", "Start Time", "Subject", "Location",
                         "Description"])
    # Zip the parallel accumulator lists into one row per event.
    csv_writer.writerows(zip(start_date, start_time, title, location,
                             description))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment