Created
March 9, 2014 15:41
-
-
Save probonopd/9449628 to your computer and use it in GitHub Desktop.
Create an RSS feed out of schwaebische.de obituary images
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python
#
# Create an RSS feed out of schwaebische.de obituary images
# (original spelling: "orbituaries")
#
# NOTE(review): Python 2 script — urllib2, print statements, BeautifulSoup 3.
import os, sys, time, datetime, urllib2, re
from BeautifulSoup import BeautifulSoup
# Set to True to print intermediate scraping results.
debug = False
# Pretend to be a regular browser; sites often reject the default urllib2 UA.
headers = { 'User-Agent' : 'Mozilla/5.0' }
# Search for obituaries published since yesterday; the site expects DD-MM-YYYY.
yesterday = datetime.date.today() - datetime.timedelta(days=1)
searchdate = yesterday.strftime("%d-%m-%Y")
# 14 = Ravensburg
| def grab_page(page=1): | |
| url = "http://trauer.schwaebische.de/Anzeige-suchen/_/_/_/14/%s/_/%s" % (searchdate, page) | |
| print url | |
| req = urllib2.Request(url, None, headers) | |
| response = urllib2.urlopen(req) | |
| the_page = response.read() | |
| return the_page | |
| page = grab_page(1) | |
| soup = BeautifulSoup(''.join(page)) | |
| # Find out how many pages we need to process | |
| number_of_pages = int(soup.findAll('span', {"class": "lbl-counter-by-pager"})[0].text.split(" ")[3]) | |
| if(debug): print number_of_pages | |
| orbituaries = [] | |
| images = [] | |
| pages = [page] # There is at least one page (the one we already grabbed), but there could be more | |
| # If there is more than one page, grab the additional ones as well | |
| if (number_of_pages > 1): | |
| for x in range(2, number_of_pages+1): | |
| page = grab_page(x) | |
| pages.append(page) | |
| # Get the links to the orbituaries | |
| for page in pages: | |
| soup = BeautifulSoup(''.join(page)) | |
| for item in soup.findAll('a', {"class": "hyperLinkSearchItemTitle nounderline"}): | |
| orbituaries.append(item['href']) | |
| if(debug): print orbituaries | |
| ############ | |
| # The actual image seems to be dynamically loaded by a XMLHttpRequest POST request to | |
| # http://trauer.schwaebische.de/Microsite/Views/Apps/Startpage/Startpage.aspx | |
| # with request body | |
| # themesiteDomainName=Firstname-Lastname | |
| # However, the following also works: GET | |
| # http://trauer.schwaebische.de/Microsite/Views/Apps/Startpage/Startpage.aspx?themesiteDomainName=Firstname-Lastname | |
| def grab_orbituary(namestr): | |
| url = "http://trauer.schwaebische.de/Microsite/Views/Apps/Startpage/Startpage.aspx?themesiteDomainName=%s" % (namestr) | |
| print url | |
| req = urllib2.Request(url, None, headers) | |
| response = urllib2.urlopen(req) | |
| the_page = response.read() | |
| return the_page | |
# Visit every obituary microsite and pull out the image URLs.
for orbituary in orbituaries:
    # Element [4] of the "/"-split href is presumably the Firstname-Lastname
    # slug the microsite expects (see themesiteDomainName above) -- TODO confirm.
    namestr = orbituary.split("/")[4]
    if(debug): print namestr
    page = grab_orbituary(namestr)
    soup = BeautifulSoup(''.join(page))
    # print soup
    for item in soup.findAll('a', {"class": "std-hyperlink obitoryTooltipOverview"}):
        # Fragile: digs the image URL out of the 4th (index 3) attribute of the
        # tag, taking the text between the 3rd and 4th single quote -- looks
        # like it parses an inline tooltip/JS attribute value. This will break
        # if the site changes its markup. NOTE(review): verify against live HTML.
        image = item.attrs[3][1].split("'")[3]
        if(debug): print image
        images.append(image)
if(debug): print images
# Assemble the RSS 2.0 feed and write it to todesanzeigen.rss.
#
# Fix: the image URLs are untrusted text interpolated into XML -- a literal
# '&' (common in URLs) would previously have produced a malformed feed, so
# escape &, < and > with the stdlib helper. Also build the document as a
# list of parts joined once instead of repeated string concatenation.
from xml.sax.saxutils import escape

datestr = yesterday.strftime("%d.%m.%Y")
rss_parts = ["""<rss version="2.0">
<channel>
<title>Traueranzeigen seit %s</title>
<link>http://trauer.schwaebische.de/</link>
<description>Traueranzeigen seit %s</description>
""" % (datestr, datestr)]
for image in images:
    # Each image becomes one feed item; the URL doubles as the stable guid.
    rss_parts.append("""
<item>
<title>Traueranzeige</title>
<link>%s</link>
<guid>%s</guid>
</item>
""" % (escape(image), escape(image)))
rss_parts.append("""
</channel>
</rss>""")
rss = "".join(rss_parts)
with open("todesanzeigen.rss", "w") as myfile:
    myfile.write(rss)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment