Scrapy spider for scotland.gov.uk: crawls the site and records one item per Excel spreadsheet link (.xls, .xlsx, .xlsm) found, together with metadata scraped from the referring page; a Bloom filter deduplicates spreadsheet URLs across the crawl. Two files follow: the Item definition and the spider itself.
# SG_2013_10_29/items.py
#
# Define here the models for your scraped items.
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html
from scrapy.item import Item, Field


class Sg20131029Item(Item):
    # One item per spreadsheet link discovered on the site.
    label = Field()            # UUID assigned to this item
    referring = Field()        # DC.Identifier of the referring page
    referring_url = Field()    # URL of the page the link was found on
    link = Field()             # absolute URL of the .xls/.xlsx/.xlsm file
    title = Field()            # title row of the file-details table
    description = Field()      # description row of the file-details table
    summary = Field()          # reserved; left empty by the spider
    date = Field()             # DC.date.modified of the referring page
    subject = Field()          # ScotGov.Topic, or "Unknown"
    type = Field()             # DC.Type, or "Unknown"
    isbn = Field()             # DC.identifier.ISBN
    keywords = Field()         # list parsed from the keywords meta tag
    metatitle = Field()        # <meta name="title"> content
    metadescription = Field()  # <meta name="description"> content
# SG_2013_10_29/spiders/sg_spider.py (filename assumed; the items module
# above is imported under the SG_2013_10_29 package name).
#
# Written against the pre-1.0 Scrapy API (scrapy.contrib,
# HtmlXPathSelector, SgmlLinkExtractor) that was current in 2013.
import uuid
from urlparse import urljoin

from scrapy.selector import HtmlXPathSelector
from scrapy.contrib.spiders import Rule, CrawlSpider
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from pybloom import BloomFilter

from SG_2013_10_29.items import Sg20131029Item

# Module-level Bloom filter: remembers every spreadsheet URL seen so far,
# so each file is recorded at most once across the whole crawl.
bloom = BloomFilter(capacity=500000, error_rate=0.000001)

NAME = "SG"
ALLOWED = 'scotland.gov.uk'
DOMAIN = 'www.scotland.gov.uk'
URL = 'http://%s' % DOMAIN


class SGSpider(CrawlSpider):
    name = NAME
    allowed_domains = [ALLOWED]
    start_urls = [URL]

    # Follow every in-domain link except the business.scotland.gov.uk
    # subsite, and run parse_item on each fetched page.
    rules = (
        Rule(SgmlLinkExtractor(allow=(r'.*',),
                               deny=(r'business\.scotland\.gov.*',),
                               unique=True, allow_domains=[ALLOWED]),
             callback='parse_item', follow=True),
    )
    def parse_item(self, response):
        hxs = HtmlXPathSelector(response)
        items = []
        for url in hxs.select('//a/@href').extract():
            # Resolve relative hrefs against the page they appear on.
            url1 = urljoin(response.url, url)
            # Keep only spreadsheet links the crawl has not seen before.
            if (url1.endswith('.xls') or url1.endswith('.xlsx')
                    or url1.endswith('.xlsm')) and url1 not in bloom:
                bloom.add(url1)
                item = Sg20131029Item()
                item['label'] = str(uuid.uuid4())
                item['referring_url'] = response.url
                # The first five characters of the DC.Identifier value are a
                # site-specific prefix and are stripped.
                item['referring'] = "".join(hxs.select('//meta[@name="DC.Identifier"]/@content').extract())[5:]
                item['link'] = url1
                # The file-details table is assumed to hold
                # <th>label</th><td>value</td> rows.
                item['title'] = "".join(hxs.select('//table[@class="dg file"]/tr/th[contains(.,"Title")]/following-sibling::td/text()').extract())
                item['metatitle'] = "".join(hxs.select('//meta[@name="title"]/@content').extract())
                item['metadescription'] = "".join(hxs.select('//meta[@name="description"]/@content').extract())
                item['description'] = "".join(hxs.select('//table[@class="dg file"]/tr/th[contains(.,"Description")]/following-sibling::td/text()').extract())
                item['summary'] = ""
                item['date'] = "".join(hxs.select('//meta[@name="DC.date.modified"]/@content').extract())
                item['subject'] = "".join(hxs.select('//meta[@name="ScotGov.Topic"]/@content').extract()) or "Unknown"
                item['type'] = "".join(hxs.select('//meta[@name="DC.Type"]/@content').extract()) or "Unknown"
                item['isbn'] = "".join(hxs.select('//meta[@name="DC.identifier.ISBN"]/@content').extract())
                keywords = hxs.select('//meta[@name="keywords"]/@content').extract()
                item['keywords'] = [k.strip() for k in keywords[0].split(",")
                                    if len(k.strip()) > 1] if keywords else []
                items.append(item)
        self.log("stored %d items from %s" % (len(items), response.url))
        return items
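A minimal way to run the crawl, assuming the standard Scrapy project layout implied by the SG_2013_10_29 import above (the output filename and JSON format are just an example):

# From the project root (the directory containing scrapy.cfg); the
# spider name "SG" comes from the NAME constant above.
scrapy crawl SG -o spreadsheets.json -t json

The feed exporter then writes every item returned by parse_item to the output file as the crawl proceeds.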