Skip to content

Instantly share code, notes, and snippets.

@titanjer
Created February 16, 2015 12:09
Show Gist options
  • Save titanjer/958df064330227fabebd to your computer and use it in GitHub Desktop.
from scrapy.contrib.spiders import CrawlSpider
from scraper.items import ProductItem
from scrapy import log
import re
import json
# curl http://192.168.1.100:6800/schedule.json -d project=scraper -d spider=importjl -d setting=STORE=24 -d setting=RQ_QUEUE=scraper
class ImportjlSpider(CrawlSpider):
    """Re-import previously scraped items from a JSON-lines (.jl) feed.

    Each non-blank line of the response body is parsed as a JSON object and
    re-emitted as a ProductItem — e.g. to replay a feed exported by another
    spider via the scrapyd items endpoint (see the curl example above).
    """

    name = 'importjl'
    start_urls = [
        # 'http://10.8.0.1:6800/items/scraper/dwidigitalcameras.jl'
    ]

    def parse(self, response):
        """Yield one ProductItem per JSON line in the response body.

        Blank lines are skipped: a .jl feed ends with a newline, so the
        final element of split('\\n') is '' and json.loads('') would raise
        ValueError.
        """
        self.log('> %s' % response.body, level=log.INFO)
        for line in response.body.split('\n'):
            if not line.strip():
                # Skip the trailing empty line (and any stray blanks).
                continue
            self.log('> %s' % line, level=log.INFO)
            item = ProductItem(json.loads(line))
            if item.get('description'):
                # Collapse runs of newlines; guard against a missing/empty
                # description (re.sub(None) would raise TypeError).
                item['description'] = re.sub(r'\n+', '\n', item['description'])
            self.log('> %s' % item['product_number'], level=log.INFO)
            yield item
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment