Skip to content

Instantly share code, notes, and snippets.

View nramirezuy's full-sized avatar

Nicolás Ramírez nramirezuy

  • Montevideo, Uruguay
View GitHub Profile
scrapy/contrib/pipeline/images.py:112: if self.IMAGES_RESULT_FIELD in item.fields:
scrapy/contrib/pipeline/files.py:270: if self.FILES_RESULT_FIELD in item.fields:
scrapy/contrib/loader/__init__.py:122: value = self.item.fields[field_name].get(key, default)
scrapy/commands/parse.py:110: if isinstance(x, BaseItem):
scrapy/contracts/default.py:86: if isinstance(x, BaseItem):
scrapy/contrib/spiders/feed.py:129: if isinstance(ret, (BaseItem, Request)):
scrapy/contrib/exporter/__init__.py:243: if isinstance(value, BaseItem):
scrapy/contrib/loader/__init__.py:121: if isinstance(self.item, Item):
scrapy/core/scraper.py:177: elif isinstance(output, BaseItem):
def dictpath(dct, path):
"""Resolve dictpath
>>> r = {'also_viewed': ['url1', 'url2']}
>>> list(dictpath(r, 'also_viewed'))
['url1', 'url2']
>>> r = {'related': [{'url': 'url1'}, {'url': 'url2'}]}
>>> list(dictpath(r, 'related:url'))
['url1', 'url2']
>>> r = {'related': [{'urls': ['url1', 'url2']}, {'urls': ['url3', 'url4']}]}
>>> list(dictpath(r, 'related:urls'))
from scrapy.spider import Spider
from scrapy import log
class DummySpider(Spider):
    """Minimal do-nothing spider used for manual testing.

    Declares a name, the allowed domains, and a single start URL; all
    crawling behaviour is inherited from the base ``Spider`` class.
    """

    name = "dummy"
    allowed_domains = ["example.com", "iana.org"]
    start_urls = (
        'http://www.example.com/',
    )
import collections, json
from urllib import urlretrieve
from urlparse import urljoin
from csv import DictReader, reader as csv_reader
import scrapinghub
from project.settings import SH_APIKEY
LOG 1
=====
scrapy crawl state -s JOBDIR=test
/home/scrapinghub/Devel/testspiders/testspiders/spiders/dummy.py:3: ScrapyDeprecationWarning: testspiders.spiders.dummy.DummySpider inherits from deprecated class scrapy.spider.BaseSpider, please inherit from scrapy.spider.Spider. (warning only on first subclass, there may be others)
class DummySpider(BaseSpider):
/home/scrapinghub/Devel/scrapy/scrapy/contrib/linkextractors/sgml.py:106: ScrapyDeprecationWarning: SgmlLinkExtractor is deprecated and will be removed in future releases. Please use scrapy.contrib.linkextractors.LinkExtractor
ScrapyDeprecationWarning
2014-08-21 14:30:41-0300 [scrapy] INFO: Scrapy 0.25.1 started (bot: testspiders)
2014-08-21 14:30:41-0300 [scrapy] INFO: Optional features available: ssl, http11, boto
2014-08-21 14:30:41-0300 [scrapy] INFO: Overridden settings: {'CLOSESPIDER_TIMEOUT': 3600, 'CLOSESPIDER_PAGECOUNT': 1000, 'SPIDER_MODULES': ['testspiders.spiders'], 'NEWSPIDER_MODULE': 'testspiders.spiders', 'BOT_NAME': 'testspiders'}
from scrapy.spider import Spider
from scrapy.http import Request
class StateSpider(Spider):
    """Spider that reports its persisted ``state`` attribute on start-up.

    Intended to be run with ``-s JOBDIR=...`` so the spider state is
    persisted between runs; prints ``None`` when no state was restored.
    """

    name = 'state'

    def start_requests(self):
        # Fixed: the original used the Python 2 `print` statement, which is
        # a SyntaxError on Python 3.  The function-call form produces the
        # same output ("State: <value>") on both interpreters when the
        # arguments are passed separately.
        print('State:', getattr(self, 'state', None))
        yield Request('http://example.com')
def parse_category(self, response):
    """Extract the category from *response* and schedule one product
    request per product URL found on the page.

    Each request carries an independent ``item.copy()`` in its ``meta``
    so the per-product callbacks never mutate a shared item instance.
    """
    item = Item()
    item['category'] = get_category(response)
    for url in get_product_urls(response):
        # Fixed: the meta dict literal was closed with ']' instead of '}',
        # which made the original line a SyntaxError.
        yield Request(url, callback=self.parse_product, meta={'item': item.copy()})
def parse_product(self, response):
    """Return the partially built item handed over via the request meta.

    Falls back to an empty dict when no ``'item'`` key is present in
    ``response.meta``.
    """
    carried_item = response.meta.get('item', {})
    ...  # remaining field population elided in the original snippet
    return carried_item
from scrapy.spider import Spider
class SeveralNamesSpider(Spider):
def start_requests(self):
print 'name: {}, start_urls: {}'.format(self.name, self.start_urls)
for name, start_urls in (('name1', ('url1', )), ('name2', ('url2', ))):
diff --git a/scrapy/selector/unified.py b/scrapy/selector/unified.py
index b8a3678..6ce1de1 100644
--- a/scrapy/selector/unified.py
+++ b/scrapy/selector/unified.py
@@ -46,10 +46,40 @@ def _response_from_text(text, st):
body=unicode_to_str(text, 'utf-8'))
+import threading
+data = threading.local()
diff --git a/scrapy/selector/unified.py b/scrapy/selector/unified.py
index b8a3678..28d8ac9 100644
--- a/scrapy/selector/unified.py
+++ b/scrapy/selector/unified.py
@@ -46,6 +46,38 @@ def _response_from_text(text, st):
body=unicode_to_str(text, 'utf-8'))
+class SelectorContext(object):
+