Created
March 9, 2014 15:41
-
-
Save probonopd/9449628 to your computer and use it in GitHub Desktop.
Create an RSS feed out of schwaebische.de obituary images
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python
#
# Create an RSS feed out of schwaebische.de obituary images
# (original spelling: "orbituaries")
#
# NOTE(review): Python 2 script — urllib2, print statements, BeautifulSoup 3.
import os, sys, time, datetime, urllib2, re
from BeautifulSoup import BeautifulSoup
# Set to True to print intermediate scraping results.
debug = False
# Pretend to be a regular browser; sites often reject the default urllib2 UA.
headers = { 'User-Agent' : 'Mozilla/5.0' }
# Search for obituaries published since yesterday; the site expects DD-MM-YYYY.
yesterday = datetime.date.today() - datetime.timedelta(days=1)
searchdate = yesterday.strftime("%d-%m-%Y")
# 14 = Ravensburg
| def grab_page(page=1): | |
| url = "http://trauer.schwaebische.de/Anzeige-suchen/_/_/_/14/%s/_/%s" % (searchdate, page) | |
| print url | |
| req = urllib2.Request(url, None, headers) | |
| response = urllib2.urlopen(req) | |
| the_page = response.read() | |
| return the_page | |
| page = grab_page(1) | |
| soup = BeautifulSoup(''.join(page)) | |
| # Find out how many pages we need to process | |
| number_of_pages = int(soup.findAll('span', {"class": "lbl-counter-by-pager"})[0].text.split(" ")[3]) | |
| if(debug): print number_of_pages | |
| orbituaries = [] | |
| images = [] | |
| pages = [page] # There is at least one page (the one we already grabbed), but there could be more | |
| # If there is more than one page, grab the additional ones as well | |
| if (number_of_pages > 1): | |
| for x in range(2, number_of_pages+1): | |
| page = grab_page(x) | |
| pages.append(page) | |
| # Get the links to the orbituaries | |
| for page in pages: | |
| soup = BeautifulSoup(''.join(page)) | |
| for item in soup.findAll('a', {"class": "hyperLinkSearchItemTitle nounderline"}): | |
| orbituaries.append(item['href']) | |
| if(debug): print orbituaries | |
| ############ | |
| # The actual image seems to be dynamically loaded by a XMLHttpRequest POST request to | |
| # http://trauer.schwaebische.de/Microsite/Views/Apps/Startpage/Startpage.aspx | |
| # with request body | |
| # themesiteDomainName=Firstname-Lastname | |
| # However, the following also works: GET | |
| # http://trauer.schwaebische.de/Microsite/Views/Apps/Startpage/Startpage.aspx?themesiteDomainName=Firstname-Lastname | |
| def grab_orbituary(namestr): | |
| url = "http://trauer.schwaebische.de/Microsite/Views/Apps/Startpage/Startpage.aspx?themesiteDomainName=%s" % (namestr) | |
| print url | |
| req = urllib2.Request(url, None, headers) | |
| response = urllib2.urlopen(req) | |
| the_page = response.read() | |
| return the_page | |
# Visit every obituary microsite and pull out the image URLs.
for orbituary in orbituaries:
    # Element [4] of the "/"-split href is presumably the Firstname-Lastname
    # slug the microsite expects (see themesiteDomainName above) -- TODO confirm.
    namestr = orbituary.split("/")[4]
    if(debug): print namestr
    page = grab_orbituary(namestr)
    soup = BeautifulSoup(''.join(page))
    # print soup
    for item in soup.findAll('a', {"class": "std-hyperlink obitoryTooltipOverview"}):
        # Fragile: digs the image URL out of the 4th (index 3) attribute of the
        # tag, taking the text between the 3rd and 4th single quote -- looks
        # like it parses an inline tooltip/JS attribute value. This will break
        # if the site changes its markup. NOTE(review): verify against live HTML.
        image = item.attrs[3][1].split("'")[3]
        if(debug): print image
        images.append(image)
if(debug): print images
# Assemble the RSS 2.0 feed and write it to todesanzeigen.rss.
#
# Fix: the image URLs are untrusted text interpolated into XML -- a literal
# '&' (common in URLs) would previously have produced a malformed feed, so
# escape &, < and > with the stdlib helper. Also build the document as a
# list of parts joined once instead of repeated string concatenation.
from xml.sax.saxutils import escape

datestr = yesterday.strftime("%d.%m.%Y")
rss_parts = ["""<rss version="2.0">
<channel>
<title>Traueranzeigen seit %s</title>
<link>http://trauer.schwaebische.de/</link>
<description>Traueranzeigen seit %s</description>
""" % (datestr, datestr)]
for image in images:
    # Each image becomes one feed item; the URL doubles as the stable guid.
    rss_parts.append("""
<item>
<title>Traueranzeige</title>
<link>%s</link>
<guid>%s</guid>
</item>
""" % (escape(image), escape(image)))
rss_parts.append("""
</channel>
</rss>""")
rss = "".join(rss_parts)
with open("todesanzeigen.rss", "w") as myfile:
    myfile.write(rss)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment