Skip to content

Instantly share code, notes, and snippets.

@jwoschitz
Created November 28, 2013 17:57
Show Gist options
  • Save jwoschitz/7695923 to your computer and use it in GitHub Desktop.
Save jwoschitz/7695923 to your computer and use it in GitHub Desktop.
xmas-market-scraper
'''
dependencies:
requests
http://www.python-requests.org/
pip install requests
beautifulsoup4
http://www.crummy.com/software/BeautifulSoup/
pip install beautifulsoup4
'''
import sys
import requests
from bs4 import BeautifulSoup
base_url = 'http://www.weihnachteninberlin.de'
def get_soup(url):
    """Fetch ``base_url + url`` and return the parsed HTML document.

    url -- site-relative path (e.g. '/weihnachtsmaerkte/').
    Raises requests.HTTPError on a non-2xx response instead of silently
    parsing an error page.
    """
    r = requests.get(base_url + url)
    # Fail fast on HTTP errors (404/500) rather than scraping an error page.
    r.raise_for_status()
    # Name the parser explicitly: without it BeautifulSoup picks whatever
    # library is installed, which varies between machines (and warns).
    return BeautifulSoup(r.text, 'html.parser')
# Scrape the overview page: collect each market's detail-page URL, mapped
# to the link text (encoded for printing) that serves as the market name.
soup = get_soup('/weihnachtsmaerkte/')
right_col = soup.find('div', id='col3_content')
urls = {}
for link in right_col.find_all('a'):
    url = link.get('href')
    # `get('href')` returns None for anchor-less <a> tags -- guard before
    # calling .startswith, and only keep internal market links.
    if url and url.startswith('/weihnachtsmaerkte/'):
        # Skip duplicates and links without plain text (e.g. image links).
        if url not in urls and link.string is not None:
            # sys.stdout.encoding is None when stdout is piped (Python 2);
            # fall back to UTF-8 so the script works non-interactively too.
            urls[url] = link.string.encode(sys.stdout.encoding or 'utf-8', 'ignore')
data = []
for url in urls:
soup = get_soup(url)
address_element = soup.find('div', id='col3_content').find('div', class_='mod_contentteaser').find('p')
if address_element is None:
continue
address_parts = address_element.contents
address = []
for part in address_parts:
if isinstance(part, unicode):
address.append(part.strip().strip(' \t\n\r').encode(sys.stdout.encoding, 'ignore'))
data.append({
'name': urls[url],
'url': base_url + url,
'address': address
})
print data
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment