Skip to content

Instantly share code, notes, and snippets.

@lumbric
Created June 7, 2015 14:25
Show Gist options
  • Save lumbric/b9d7e5dee5a4c8e2d15c to your computer and use it in GitHub Desktop.
Save lumbric/b9d7e5dee5a4c8e2d15c to your computer and use it in GitHub Desktop.
A very fast web scraping script to get a CSV with Volxkino events. Importable to Google calendar
"""A very fast web scraping script to get a CSV with Volxkino events.
Importable to Google calendar.
Script written 2015.
[email protected]
Requirements:
- BeautifulSoup 4: https://pypi.python.org/pypi/beautifulsoup4
- requests: https://pypi.python.org/pypi/requests
"""
import csv
import codecs
import cStringIO
import itertools
import locale
from datetime import datetime
from bs4 import BeautifulSoup
import requests
# Listing of upcoming events; "{}" is filled with the 1-based page number.
URL = "http://volxkino.at/events/bevorstehend/page/{}/"

# Parallel result columns: index i of every list describes the same event.
# The scraping loop appends to them and they are zipped into CSV rows at the
# end of the script.
start_date = []   # dates formatted as "YYYY-MM-DD"
start_time = []   # time text as it appears on the page
location = []
title = []
description = []  # the event's link, a blank line, then its description text
# There is intentionally no "link" list: the scraping loop keeps each event's
# link in a plain string and prepends it to the description.
class CsvUnicodeWriter(object):
    """
    CSV writer that takes rows of unicode text and writes them to the
    stream "f" in the requested encoding.

    Every row is rendered by a regular csv.writer into an in-memory queue
    as UTF-8, read back as text, and then re-encoded into the target
    encoding before it is written to "f".
    """

    def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds):
        """Wrap stream "f"; "dialect" and "kwds" are handed to csv.writer."""
        self.stream = f
        self.encoder = codecs.getincrementalencoder(encoding)()
        # csv.writer does not write to "f" directly but into this queue
        self.queue = cStringIO.StringIO()
        self.writer = csv.writer(self.queue, dialect=dialect, **kwds)

    def writerow(self, row):
        """Write a single row; each cell must support .encode("utf-8")."""
        utf8_cells = [cell.encode("utf-8") for cell in row]
        self.writer.writerow(utf8_cells)
        # Pull the UTF-8 line out of the queue and turn it back into text ...
        text = self.queue.getvalue().decode("utf-8")
        # ... so it can be emitted in the target encoding
        self.stream.write(self.encoder.encode(text))
        # Leave an empty queue behind for the next row
        self.queue.truncate(0)

    def writerows(self, rows):
        """Write every row of the iterable "rows", one writerow() call each."""
        for current in rows:
            self.writerow(current)
# Scrape all listing pages, collect one entry per event in the module-level
# column lists, then write them to "volxkino_calendar.csv".

# for date parsing: "%B" (month name) in strptime depends on the LC_TIME
# locale, which is switched to de_AT here. Raises locale.Error if that locale
# is not installed on the system.
locale.setlocale(locale.LC_TIME, ("de_AT.UTF-8", "de_AT.UTF-8"))

for page_num in itertools.count(start=1):
    # The timeout keeps the script from hanging forever on a stalled
    # connection. The HTTP status is not checked: the loop relies on a page
    # without events to terminate.
    page = requests.get(URL.format(page_num), timeout=30)
    # NOTE(review): no parser is named, so bs4 picks whichever is installed;
    # consider pinning one once verified against the site's markup.
    soup = BeautifulSoup(page.text)

    events = soup.select('div.tribe-events-event')
    if not events:
        print("page {} empty".format(page_num))
        break

    for event in events:
        meta_cells = event.select('.tribe-events-event-list-meta td')

        # Cell 1 is read as "<date words> <time>": everything but the last
        # whitespace-separated word is the date, the last word is the time.
        start_datetime = meta_cells[1].get_text().strip()
        words = start_datetime.split()
        event_date = datetime.strptime(" ".join(words[:-1]), "%d. %B %Y")
        event_time = " ".join(words[-1:])

        event_title = event.select('h2.entry-title')[0].get_text()
        # The last meta cell is used as the location.
        event_location = meta_cells[-1].get_text().strip()

        # no link field for google calendar, prepend to description
        event_link = event.select('h2.entry-title a')[0].attrs['href']
        desc = event.select('div.entry-content')[0].get_text()

        # Append only after every field was extracted, so a failure in the
        # middle of an event cannot leave the parallel lists out of step.
        start_date.append(event_date.strftime("%Y-%m-%d"))
        start_time.append(event_time)
        title.append(event_title)
        location.append(event_location)
        description.append("%s\n\n%s" % (event_link, desc))

# possible fields:
# Subject, Start Date, Start Time, End Date, End Time, All Day Event,
# Reminder On/Off, Reminder Date, Reminder Time, Meeting Organizer,
# Description, Location, Private

# Binary mode, as the Python 2 csv documentation requires for file objects
# (otherwise the writer's "\r\n" line ends get translated again on platforms
# that distinguish text and binary files). The with-block makes sure the
# file is flushed and closed.
with open("volxkino_calendar.csv", "wb") as csv_file:
    csv_writer = CsvUnicodeWriter(csv_file, quotechar='"', delimiter=',',
                                  quoting=csv.QUOTE_ALL)
    csv_writer.writerow(["Start Date", "Start Time", "Subject", "Location",
                         "Description"])
    csv_writer.writerows(
        zip(start_date, start_time, title, location, description))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment