Created
June 7, 2015 14:25
-
-
Save lumbric/b9d7e5dee5a4c8e2d15c to your computer and use it in GitHub Desktop.
A very fast web scraping script to get a CSV with Volxkino events. Importable to Google calendar
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""A very fast web scraping script to get a CSV with Volxkino events. | |
Importable to Google calendar. | |
Script written 2015. | |
[email protected] | |
Reuqirments: | |
- BetautifulSoup 4: https://pypi.python.org/pypi/beautifulsoup4 | |
""" | |
import csv | |
import codecs | |
import cStringIO | |
import itertools | |
import locale | |
from datetime import datetime | |
from bs4 import BeautifulSoup | |
import requests | |
URL = "http://volxkino.at/events/bevorstehend/page/{}/" | |
start_date = [] | |
start_time = [] | |
location = [] | |
title = [] | |
link = [] | |
description = [] | |
class CsvUnicodeWriter(object):
    """
    A CSV writer which will write rows to CSV file "f",
    which is encoded in the given encoding.

    Python 2 recipe from the stdlib csv documentation: the csv module
    cannot write unicode directly, so each row is encoded to UTF-8,
    passed through csv.writer into an in-memory queue, then decoded and
    re-encoded to the target encoding before being written to the real
    stream.
    """
    def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds):
        # Redirect output to a queue
        self.queue = cStringIO.StringIO()
        self.writer = csv.writer(self.queue, dialect=dialect, **kwds)
        # Target stream the re-encoded bytes are finally written to.
        self.stream = f
        # Incremental encoder keeps state across writes, so multi-byte
        # sequences are handled correctly between writerow() calls.
        self.encoder = codecs.getincrementalencoder(encoding)()
    def writerow(self, row):
        # Assumes every cell is a unicode string (has .encode()).
        self.writer.writerow([s.encode("utf-8") for s in row])
        # Fetch UTF-8 output from the queue ...
        data = self.queue.getvalue()
        data = data.decode("utf-8")
        # ... and reencode it into the target encoding
        data = self.encoder.encode(data)
        # write to the target stream
        self.stream.write(data)
        # empty queue
        # NOTE(review): Python 2 cStringIO.truncate(0) also resets the
        # file position, unlike io.StringIO — this relies on that.
        self.queue.truncate(0)
    def writerows(self, rows):
        # Write each row in turn through writerow().
        for row in rows:
            self.writerow(row)
# German locale is required so strptime("%B") can parse month names
# like "Juni" in the scraped dates.
locale.setlocale(locale.LC_TIME, ("de_AT.UTF-8", "de_AT.UTF-8"))
# Walk the paginated event listing until an empty page is returned.
for page_num in itertools.count(start=1):
    page = requests.get(URL.format(page_num))
    # Explicit parser avoids bs4's "no parser specified" warning and
    # keeps the parser choice consistent across environments.
    soup = BeautifulSoup(page.text, "html.parser")
    # Select events once per page instead of twice (original queried
    # the same selector for the emptiness check and the loop).
    events = soup.select('div.tribe-events-event')
    if not events:
        # Past the last page — stop paginating.
        # print(...) works identically under Python 2 and 3 here.
        print("page {} empty".format(page_num))
        break
    for event in events:
        # Query the meta table cells once; cell [1] holds the start
        # date/time (e.g. "7. Juni 2015 21:00"), the last cell the venue.
        meta_cells = event.select('.tribe-events-event-list-meta td')
        start_datetime = meta_cells[1].get_text().strip()
        parts = start_datetime.split()
        # Everything but the last token is the date, e.g. "7. Juni 2015".
        dt = " ".join(parts[:-1])
        start_date.append(datetime.strptime(dt, "%d. %B %Y"
                                            ).strftime("%Y-%m-%d"))
        # The final token is the start time, e.g. "21:00".
        start_time.append(" ".join(parts[-1:]))
        title.append(event.select('h2.entry-title')[0].get_text())
        location.append(meta_cells[-1].get_text().strip())
        # no link field for google calendar, prepend to description
        link = event.select('h2.entry-title a')[0].attrs['href']
        desc = event.select('div.entry-content')[0].get_text()
        description.append("%s\n\n%s" % (link, desc))
# possible fields:
# Subject, Start Date, Start Time, End Date, End Time, All Day Event,
# Reminder On/Off, Reminder Date, Reminder Time, Meeting Organizer,
# Description, Location, Private
# Context manager ensures the CSV file is flushed and closed even if
# writing fails (the original never closed the handle).
with open("volxkino_calendar.csv", "w") as csv_file:
    csv_writer = CsvUnicodeWriter(csv_file, quotechar='"', delimiter=',',
                                  quoting=csv.QUOTE_ALL)
    # Header row matches the Google Calendar CSV import field names.
    csv_writer.writerow(["Start Date", "Start Time", "Subject", "Location",
                         "Description"])
    # Zip the parallel accumulator lists into one row per event.
    csv_writer.writerows(zip(start_date, start_time, title, location,
                             description))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment