Created
May 21, 2012 09:08
-
-
Save rikva/2761356 to your computer and use it in GitHub Desktop.
Don't Judge This Code
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from scrapy.contrib.spiders import CrawlSpider, Rule | |
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor | |
from scrapy.selector import HtmlXPathSelector | |
from kamertje.items import KamertjeItem | |
import sqlite3 as sqlite | |
from scrapy import log | |
class KamertjeSpider(CrawlSpider):
    """Spider for kamertje.nl that turns room ("kamer") pages into KamertjeItem objects.

    Pages whose lowercased URL is already present in the ``kamertje`` table of
    the SQLite database are skipped, so they are not scraped a second time.
    """

    name = "kamertje"
    allowed_domains = ["www.kamertje.nl", "kamertje.nl"]
    start_urls = ["http://www.kamertje.nl/"]

    rules = (
        # Listing/index pages: no callback is attached to these links.
        Rule(SgmlLinkExtractor(allow=('kamers_in', 'index.php'), unique=True)),
        # Individual room pages are handed to parse_item.
        Rule(SgmlLinkExtractor(allow=(r'kamers\/[0-9]',), unique=True), callback='parse_item'),
    )

    # SQLite database file; the relative path is resolved against the
    # working directory of the crawl process.
    _DB_PATH = '../scrapedata.db'

    # Create SQL tables here instead of in pipeline because otherwise they'll
    # never be created.
    _CREATE_KAMERTJE_SQL = ('CREATE TABLE IF NOT EXISTS kamertje ('
                            'url text PRIMARY KEY, '
                            'title text, '
                            'street text, '
                            'city text, '
                            'description text,'
                            'price text,'
                            'priceType text,'
                            'size text,'
                            'type text,'
                            'date_added datetime)')
    _CREATE_KAMERTJE_IMG_SQL = ('CREATE TABLE IF NOT EXISTS kamertje_img ('
                                'kamertje_url text,'
                                'image_url text,'
                                'image_file text)')

    def _is_already_stored(self, url):
        """Return True when ``url`` already has a row in the ``kamertje`` table.

        Both tables are created first if they do not exist yet, so the lookup
        also works against a fresh database file.
        """
        # Checking the DB per page is not very efficient disk/cpu-wise but
        # saves bandwidth.
        # TODO: find a way to open a connection once per crawl session.
        connection = sqlite.connect(self._DB_PATH)
        try:
            cursor = connection.cursor()
            cursor.execute(self._CREATE_KAMERTJE_SQL)
            cursor.execute(self._CREATE_KAMERTJE_IMG_SQL)
            connection.commit()
            cursor.execute('select 1 from kamertje where url=? limit 1', (url,))
            return cursor.fetchone() is not None
        finally:
            # Always release the connection, even when a statement fails.
            connection.close()

    @staticmethod
    def _first(hxs, xpath):
        """Return the first extracted match of ``xpath``.

        Raises IndexError when the expression matches nothing.
        """
        return hxs.select(xpath)[0].extract()

    def parse_item(self, response):
        """Scrape one room page.

        Returns a populated KamertjeItem, or None when the page URL is already
        present in the database. Raises IndexError when a mandatory field
        (street, city, price, priceType, size, type) cannot be found.
        """
        self.log('Found kamer page: %s' % response.url)

        # Must be lowercased because of duplicates - the site serves the same
        # page under differently-cased URLs.
        url = response.url.lower()
        if self._is_already_stored(url):
            log.msg("Item is already found in database : %s" % url)
            return None

        hxs = HtmlXPathSelector(response)
        first = self._first
        item = KamertjeItem()
        item['url'] = url

        header = "//*/tr[@class='tableheader']/td/table/tr/td/h1/b"
        item['street'] = first(hxs, header + "/font/text()").lower()
        # The link text contains "<something> in <city>"; keep the part after " in ".
        item['city'] = first(hxs, header + "/a/text()").split(' in ')[1].lower().strip()

        # Title can be empty, in which case the XPath matches nothing.
        try:
            item['title'] = first(hxs, "//*/td[contains(.,'beschrijving')]/../td[2]/b/text()")
        except IndexError:
            item['title'] = ''

        # Second whitespace-separated token, cut off at the decimal comma.
        item['price'] = first(
            hxs, "//*/td[contains(.,'kamerhuur')]/../td[2]/b/text()").split()[1].split(',')[0]
        item['priceType'] = first(hxs, "//*/td[contains(.,'kamerhuur')]/../td[2]/text()").strip()
        item['size'] = first(hxs, "//*/td[contains(.,'Oppervlakte')]/../td[2]/b/text()").split()[0]
        item['type'] = first(hxs, "//*/td[contains(.,'Soort')]/../td[2]/b/text()")

        # Description can be empty as well.
        try:
            item['description'] = first(hxs, "//*/td[contains(.,'Omschrijving')]/../td[2]/text()")
        except IndexError:
            item['description'] = ''

        # We want full images, not thumbs: rewrite 'room/t...' to 'room/...'
        # and make the URLs absolute.
        thumbnails = hxs.select("//*/img/@src[contains(.,'imagesroom')]").extract()
        item['image_urls'] = [
            "http://kamertje.nl/" + src.replace('room/t', 'room/') for src in thumbnails
        ]
        item['images'] = ''  # necessary

        return item
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment