# -*- coding: utf-8 -*-
import sqlite3
import time

import scrapy

#################################### commercespider ###########################
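# Each spider below follows the same incremental-crawl pattern: the URL of the
# newest post seen during the previous run is kept in a per-spider table inside
# a local SQLite file (data.sqlite). parse() stops yielding requests once it
# reaches that URL again, so only posts published since the last run are
# fetched. Only the first listing page of each site is requested, which
# suggests these spiders are meant to be scheduled to run frequently.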

class SocialbetaSpider(scrapy.Spider):
    name = 'socialbeta'
    allowed_domains = ['socialbeta.com']
    start_urls = ['http://socialbeta.com/tag/案例']
    custom_settings = {
        'ITEM_PIPELINES': {
            'datapark.pipelines.BrandMongoPipeline': 300,
            'datapark.pipelines.BrandKafkaPipeline': 301,
        }
    }

    # custom attributes
    first_url = ''
    connection = sqlite3.connect('data.sqlite')
    cursor = connection.cursor()
    cursor.execute('CREATE TABLE IF NOT EXISTS %s (latest_url TEXT)' % name)
    connection.commit()
    cursor.execute('SELECT latest_url FROM %s' % name)
    latest_url = cursor.fetchone()
    if latest_url:
        latest_url = latest_url[0]

    def parse(self, response):
        posts = response.xpath('//*[@class="postimg"]/li')
        for post in posts:
            post_url = response.urljoin(post.xpath('div/div/h3/a/@href').extract_first())
            post_title = post.xpath('div/div/h3/a/text()').extract_first()
            item = {
                '_id': post_url,
                'post_url': post_url,
                'post_title': post_title
            }
            # record the first post as the latest one and save it to SQLite
            if not self.first_url:
                self.first_url = post_url
                self.cursor.execute('DELETE FROM %s' % self.name)
                self.cursor.execute('INSERT INTO %s (latest_url) VALUES (?)' % self.name, (self.first_url,))
                self.connection.commit()
            # compare against the latest URL saved by the previous run; if they match,
            # we have reached posts that were already crawled, so stop; otherwise keep crawling
            if post_url == self.latest_url:
                print '%s - reached the point where the previous crawl stopped' % self.name
                self.connection.close()
                return
            request = scrapy.Request(url=post_url, callback=self.parse_post)
            request.meta['item'] = item
            yield request

    def parse_post(self, response):
        item = response.meta['item']
        content_text = ''.join(
            response.xpath('//div[@class="content"]//text()[normalize-space() and not(ancestor::script | ancestor::style)]').extract())
        content_html = response.xpath('//div[@class="content"]').extract_first()
        item['content_text'] = content_text.replace('\r', '').replace('\n', '').replace('\t', '')
        item['content_html'] = content_html
        item['crawl_time'] = int(time.time())
        item['site_name'] = self.name
        item['type'] = 'commerce'
        item['module'] = 'brand'
        yield item


class QdailySpider(scrapy.Spider):
    name = 'qdaily'
    allowed_domains = ['qdaily.com']
    start_urls = ['http://www.qdaily.com/categories/18.html/']
    custom_settings = {
        'ITEM_PIPELINES': {
            'datapark.pipelines.BrandMongoPipeline': 300,
            'datapark.pipelines.BrandKafkaPipeline': 301,
        }
    }

    # custom attributes
    first_url = ''
    connection = sqlite3.connect('data.sqlite')
    cursor = connection.cursor()
    cursor.execute('CREATE TABLE IF NOT EXISTS %s (latest_url TEXT)' % name)
    connection.commit()
    cursor.execute('SELECT latest_url FROM %s' % name)
    latest_url = cursor.fetchone()
    if latest_url:
        latest_url = latest_url[0]

    def parse(self, response):
        posts = response.xpath('//*[@class="packery-container articles"]/div')
        for post in posts:
            post_url = response.urljoin(post.xpath('a/@href').extract_first())
            post_title = post.xpath('a/div/div/img/@alt').extract_first()
            item = {
                '_id': post_url,
                'post_url': post_url,
                'post_title': post_title
            }
            # record the first post as the latest one and save it to SQLite
            if not self.first_url:
                self.first_url = post_url
                self.cursor.execute('DELETE FROM %s' % self.name)
                self.cursor.execute('INSERT INTO %s (latest_url) VALUES (?)' % self.name, (self.first_url,))
                self.connection.commit()
            # compare against the latest URL saved by the previous run; if they match,
            # we have reached posts that were already crawled, so stop; otherwise keep crawling
            if post_url == self.latest_url:
                print '%s - reached the point where the previous crawl stopped' % self.name
                self.connection.close()
                return
            request = scrapy.Request(url=post_url, callback=self.parse_post)
            request.meta['item'] = item
            yield request

    def parse_post(self, response):
        item = response.meta['item']
        # articles come in two layouts; choose the XPath based on which wrapper element is present
        if response.xpath('//div[@class="main long-article"]'):
            content_text = ''.join(
                response.xpath('//div[@class="main long-article"]//text()[normalize-space() and not(ancestor::script | ancestor::style)]').extract())
            content_html = response.xpath('//div[@class="main long-article"]').extract_first()
        else:
            content_text = ''.join(
                response.xpath('//div[@class="detail"]//text()[normalize-space() and not(ancestor::script | ancestor::style)]').extract())
            content_html = response.xpath('//div[@class="detail"]').extract_first()
        item['content_text'] = content_text.replace('\r', '').replace('\n', '').replace('\t', '')
        item['content_html'] = content_html
        item['crawl_time'] = int(time.time())
        item['site_name'] = self.name
        item['type'] = 'commerce'
        item['module'] = 'brand'
        yield item


class JiemianSpider(scrapy.Spider):
    name = 'jiemian'
    start_urls = ['http://www.jiemian.com/lists/49.html']
    custom_settings = {
        'ITEM_PIPELINES': {
            'datapark.pipelines.BrandMongoPipeline': 300,
            'datapark.pipelines.BrandKafkaPipeline': 301,
        }
    }

    # custom attributes
    first_url = ''
    connection = sqlite3.connect('data.sqlite')
    cursor = connection.cursor()
    cursor.execute('CREATE TABLE IF NOT EXISTS %s (latest_url TEXT)' % name)
    connection.commit()
    cursor.execute('SELECT latest_url FROM %s' % name)
    latest_url = cursor.fetchone()
    if latest_url:
        latest_url = latest_url[0]

    def parse(self, response):
        posts = response.xpath('//*[@id="load-list"]/div')
        for post in posts:
            post_url = response.urljoin(
                post.xpath('div[@class="news-right"]/div[@class="news-header"]/h3/a/@href').extract_first())
            post_title = post.xpath('div[@class="news-right"]/div[@class="news-header"]/h3/a/text()').extract_first()
            item = {
                '_id': post_url,
                'post_url': post_url,
                'post_title': post_title
            }
            # record the first post as the latest one and save it to SQLite
            if not self.first_url:
                self.first_url = post_url
                self.cursor.execute('DELETE FROM %s' % self.name)
                self.cursor.execute('INSERT INTO %s (latest_url) VALUES (?)' % self.name, (self.first_url,))
                self.connection.commit()
            # compare against the latest URL saved by the previous run; if they match,
            # we have reached posts that were already crawled, so stop; otherwise keep crawling
            if post_url == self.latest_url:
                print '%s - reached the point where the previous crawl stopped' % self.name
                self.connection.close()
                return
            request = scrapy.Request(url=post_url, callback=self.parse_post)
            request.meta['item'] = item
            yield request

    def parse_post(self, response):
        item = response.meta['item']
        content_text = ''.join(
            response.xpath('//div[@class="article-main"]//text()[normalize-space() and not(ancestor::script | ancestor::style)]').extract())
        content_html = response.xpath('//div[@class="article-main"]').extract_first()
        item['content_text'] = content_text.replace('\r', '').replace('\n', '').replace('\t', '')
        item['content_html'] = content_html
        item['crawl_time'] = int(time.time())
        item['site_name'] = self.name
        item['type'] = 'commerce'
        item['module'] = 'brand'
        yield item


class ToodaylabSpider(scrapy.Spider):
    name = 'toodaylab'
    start_urls = ['http://www.toodaylab.com/field/308']
    custom_settings = {
        'ITEM_PIPELINES': {
            'datapark.pipelines.BrandMongoPipeline': 300,
            'datapark.pipelines.BrandKafkaPipeline': 301,
        }
    }

    # custom attributes
    first_url = ''
    connection = sqlite3.connect('data.sqlite')
    cursor = connection.cursor()
    cursor.execute('CREATE TABLE IF NOT EXISTS %s (latest_url TEXT)' % name)
    connection.commit()
    cursor.execute('SELECT latest_url FROM %s' % name)
    latest_url = cursor.fetchone()
    if latest_url:
        latest_url = latest_url[0]

    def parse(self, response):
        posts = response.xpath('//*[@class="content"]/div')
        for post in posts:
            post_url = response.urljoin(post.xpath('div[@class="post-info"]/p/a/@href').extract_first())
            post_title = post.xpath('div[@class="post-info"]/p/a/text()').extract_first()
            item = {
                '_id': post_url,
                'post_url': post_url,
                'post_title': post_title
            }
            # record the first post as the latest one and save it to SQLite
            if not self.first_url:
                self.first_url = post_url
                self.cursor.execute('DELETE FROM %s' % self.name)
                self.cursor.execute('INSERT INTO %s (latest_url) VALUES (?)' % self.name, (self.first_url,))
                self.connection.commit()
            # compare against the latest URL saved by the previous run; if they match,
            # we have reached posts that were already crawled, so stop; otherwise keep crawling
            if post_url == self.latest_url:
                print '%s - reached the point where the previous crawl stopped' % self.name
                self.connection.close()
                return
            request = scrapy.Request(url=post_url, callback=self.parse_post)
            request.meta['item'] = item
            yield request

    def parse_post(self, response):
        item = response.meta['item']
        content_text = ''.join(
            response.xpath('//div[@class="post-content"]//text()[normalize-space() and not(ancestor::script | ancestor::style)]').extract())
        content_html = response.xpath('//div[@class="post-content"]').extract_first()
        item['content_text'] = content_text.replace('\r', '').replace('\n', '').replace('\t', '')
        item['content_html'] = content_html
        item['crawl_time'] = int(time.time())
        item['site_name'] = self.name
        item['type'] = 'commerce'
        item['module'] = 'brand'
        yield item


class MadisonboomSpider(scrapy.Spider):
    name = 'madisonboom'
    start_urls = ['http://www.madisonboom.com/category/works/']
    custom_settings = {
        'ITEM_PIPELINES': {
            'datapark.pipelines.BrandMongoPipeline': 300,
            'datapark.pipelines.BrandKafkaPipeline': 301,
        }
    }

    # custom attributes
    first_url = ''
    connection = sqlite3.connect('data.sqlite')
    cursor = connection.cursor()
    cursor.execute('CREATE TABLE IF NOT EXISTS %s (latest_url TEXT)' % name)
    connection.commit()
    cursor.execute('SELECT latest_url FROM %s' % name)
    latest_url = cursor.fetchone()
    if latest_url:
        latest_url = latest_url[0]

    def parse(self, response):
        posts = response.xpath('//*[@id="gallery_list_elements"]/li')
        for post in posts:
            post_url = response.urljoin(post.xpath('h3/a/@href').extract_first())
            post_title = post.xpath('h3/p/a/text()').extract_first()
            item = {
                '_id': post_url,
                'post_url': post_url,
                'post_title': post_title
            }
            # record the first post as the latest one and save it to SQLite
            if not self.first_url:
                self.first_url = post_url
                self.cursor.execute('DELETE FROM %s' % self.name)
                self.cursor.execute('INSERT INTO %s (latest_url) VALUES (?)' % self.name, (self.first_url,))
                self.connection.commit()
            # compare against the latest URL saved by the previous run; if they match,
            # we have reached posts that were already crawled, so stop; otherwise keep crawling
            if post_url == self.latest_url:
                print '%s - reached the point where the previous crawl stopped' % self.name
                self.connection.close()
                return
            request = scrapy.Request(url=post_url, callback=self.parse_post)
            request.meta['item'] = item
            yield request

    def parse_post(self, response):
        item = response.meta['item']
        content_text = ''.join(
            response.xpath('//div[@class="slide-info"]//text()[normalize-space() and not(ancestor::script | ancestor::style)]').extract())
        content_html = response.xpath('//div[@class="slide-info"]').extract_first()
        item['content_text'] = content_text.replace('\r', '').replace('\n', '').replace('\t', '')
        item['content_html'] = content_html
        item['crawl_time'] = int(time.time())
        item['site_name'] = self.name
        item['type'] = 'commerce'
        item['module'] = 'brand'
        yield item


class IwebadSpider(scrapy.Spider):
    name = 'iwebad'
    start_urls = ['http://iwebad.com/']
    custom_settings = {
        'ITEM_PIPELINES': {
            'datapark.pipelines.BrandMongoPipeline': 300,
            'datapark.pipelines.BrandKafkaPipeline': 301,
        }
    }

    # custom attributes
    first_url = ''
    connection = sqlite3.connect('data.sqlite')
    cursor = connection.cursor()
    cursor.execute('CREATE TABLE IF NOT EXISTS %s (latest_url TEXT)' % name)
    connection.commit()
    cursor.execute('SELECT latest_url FROM %s' % name)
    latest_url = cursor.fetchone()
    if latest_url:
        latest_url = latest_url[0]

    def parse(self, response):
        posts = response.xpath('//*[@class="new_search_works"]/div')
        for post in posts:
            post_url = response.urljoin(post.xpath('div[@class="works_info"]/h4/span/a/@href').extract_first())
            post_title = post.xpath('div[@class="works_info"]/h4/span/a/text()').extract_first()
            item = {
                '_id': post_url,
                'post_url': post_url,
                'post_title': post_title
            }
            # record the first post as the latest one and save it to SQLite
            if not self.first_url:
                self.first_url = post_url
                self.cursor.execute('DELETE FROM %s' % self.name)
                self.cursor.execute('INSERT INTO %s (latest_url) VALUES (?)' % self.name, (self.first_url,))
                self.connection.commit()
            # compare against the latest URL saved by the previous run; if they match,
            # we have reached posts that were already crawled, so stop; otherwise keep crawling
            if post_url == self.latest_url:
                print '%s - reached the point where the previous crawl stopped' % self.name
                self.connection.close()
                return
            request = scrapy.Request(url=post_url, callback=self.parse_post)
            request.meta['item'] = item
            yield request

    def parse_post(self, response):
        item = response.meta['item']
        content_text = ''.join(
            response.xpath('//div[@class="news_ckkk "]//text()[normalize-space() and not(ancestor::script | ancestor::style)]').extract())
        content_html = response.xpath('//div[@class="news_ckkk "]').extract_first()
        item['content_text'] = content_text.replace('\r', '').replace('\n', '').replace('\t', '')
        item['content_html'] = content_html
        item['crawl_time'] = int(time.time())
        item['site_name'] = self.name
        item['type'] = 'commerce'
        item['module'] = 'brand'
        yield item


class AdquanSpider(scrapy.Spider):
    name = 'adquan'
    start_urls = ['http://www.adquan.com/']
    custom_settings = {
        'ITEM_PIPELINES': {
            'datapark.pipelines.BrandMongoPipeline': 300,
            'datapark.pipelines.BrandKafkaPipeline': 301,
        }
    }

    # custom attributes
    first_url = ''
    connection = sqlite3.connect('data.sqlite')
    cursor = connection.cursor()
    cursor.execute('CREATE TABLE IF NOT EXISTS %s (latest_url TEXT)' % name)
    connection.commit()
    cursor.execute('SELECT latest_url FROM %s' % name)
    latest_url = cursor.fetchone()
    if latest_url:
        latest_url = latest_url[0]

    def parse(self, response):
        posts = response.xpath('//*[@class="work_list_left"]/div')
        for post in posts:
            post_url = response.urljoin(post.xpath('h2/a/@href').extract_first())
            post_title = post.xpath('h2/a/text()').extract_first()
            item = {
                '_id': post_url,
                'post_url': post_url,
                'post_title': post_title
            }
            # record the first post as the latest one and save it to SQLite
            if not self.first_url:
                self.first_url = post_url
                self.cursor.execute('DELETE FROM %s' % self.name)
                self.cursor.execute('INSERT INTO %s (latest_url) VALUES (?)' % self.name, (self.first_url,))
                self.connection.commit()
            # compare against the latest URL saved by the previous run; if they match,
            # we have reached posts that were already crawled, so stop; otherwise keep crawling
            if post_url == self.latest_url:
                print '%s - reached the point where the previous crawl stopped' % self.name
                self.connection.close()
                return
            request = scrapy.Request(url=post_url, callback=self.parse_post)
            request.meta['item'] = item
            yield request

    def parse_post(self, response):
        item = response.meta['item']
        content_text = ''.join(
            response.xpath('//div[@class="deta_inner"]//text()[normalize-space() and not(ancestor::script | ancestor::style)]').extract())
        content_html = response.xpath('//div[@class="deta_inner"]').extract_first()
        if not content_text:
            content_text = ''.join(
                response.xpath('//text()[normalize-space() and not(ancestor::script | ancestor::style)]').extract())
            content_html = response.xpath('//div[@class="con_Text"]').extract_first()
        item['content_text'] = content_text.replace('\r', '').replace('\n', '').replace('\t', '')
        item['content_html'] = content_html
        item['crawl_time'] = int(time.time())
        item['site_name'] = self.name
        item['type'] = 'commerce'
        item['module'] = 'brand'
        yield item


class DigitalingSpider(scrapy.Spider):
    name = 'digitaling'
    start_urls = ['http://www.digitaling.com/projects']
    custom_settings = {
        'ITEM_PIPELINES': {
            'datapark.pipelines.BrandMongoPipeline': 300,
            'datapark.pipelines.BrandKafkaPipeline': 301,
        }
    }

    # custom attributes
    first_url = ''
    connection = sqlite3.connect('data.sqlite')
    cursor = connection.cursor()
    cursor.execute('CREATE TABLE IF NOT EXISTS %s (latest_url TEXT)' % name)
    connection.commit()
    cursor.execute('SELECT latest_url FROM %s' % name)
    latest_url = cursor.fetchone()
    if latest_url:
        latest_url = latest_url[0]

    def parse(self, response):
        posts = response.xpath('//div[@id="pro_list"]/div')
        for post in posts:
            post_url = response.urljoin(post.xpath('div[@class="works_bd"]/div/h3/a/@href').extract_first())
            post_title = post.xpath('div[@class="works_bd"]/div/h3/a/text()').extract_first()
            item = {
                '_id': post_url,
                'post_url': post_url,
                'post_title': post_title
            }
            # record the first post as the latest one and save it to SQLite
            if not self.first_url:
                self.first_url = post_url
                self.cursor.execute('DELETE FROM %s' % self.name)
                self.cursor.execute('INSERT INTO %s (latest_url) VALUES (?)' % self.name, (self.first_url,))
                self.connection.commit()
            # compare against the latest URL saved by the previous run; if they match,
            # we have reached posts that were already crawled, so stop; otherwise keep crawling
            if post_url == self.latest_url:
                print '%s - reached the point where the previous crawl stopped' % self.name
                self.connection.close()
                return
            request = scrapy.Request(url=post_url, callback=self.parse_post)
            request.meta['item'] = item
            yield request

    def parse_post(self, response):
        item = response.meta['item']
        content_text = ''.join(
            response.xpath('//div[@id="article_con"]//text()[normalize-space() and not(ancestor::script | ancestor::style)]').extract())
        content_html = response.xpath('//div[@id="article_con"]').extract_first()
        item['content_text'] = content_text.replace('\r', '').replace('\n', '').replace('\t', '')
        item['content_html'] = content_html
        item['crawl_time'] = int(time.time())
        item['site_name'] = self.name
        item['type'] = 'commerce'
        item['module'] = 'brand'
        yield item


class IresearchSpider(scrapy.Spider):
    name = 'iresearch'
    start_urls = ['http://a.iresearch.cn/']
    custom_settings = {
        'ITEM_PIPELINES': {
            # 'datapark.pipelines.BrandMongoPipeline': 300,
            # 'datapark.pipelines.BrandKafkaPipeline': 301,
        }
    }

    # custom attributes
    first_url = ''
    connection = sqlite3.connect('data.sqlite')
    cursor = connection.cursor()
    cursor.execute('CREATE TABLE IF NOT EXISTS %s (latest_url TEXT)' % name)
    connection.commit()
    cursor.execute('SELECT latest_url FROM %s' % name)
    latest_url = cursor.fetchone()
    if latest_url:
        latest_url = latest_url[0]

    def parse(self, response):
        posts = response.xpath('//div[@id="tab-list"]/div/ul/li')
        for post in posts:
            post_url = response.urljoin(post.xpath('h3/a/@href').extract_first())
            post_title = post.xpath('h3/a/text()').extract_first()
            item = {
                '_id': post_url,
                'post_url': post_url,
                'post_title': post_title
            }
            # record the first post as the latest one and save it to SQLite
            if not self.first_url:
                self.first_url = post_url
                self.cursor.execute('DELETE FROM %s' % self.name)
                self.cursor.execute('INSERT INTO %s (latest_url) VALUES (?)' % self.name, (self.first_url,))
                self.connection.commit()
            # compare against the latest URL saved by the previous run; if they match,
            # we have reached posts that were already crawled, so stop; otherwise keep crawling
            if post_url == self.latest_url:
                print '%s - reached the point where the previous crawl stopped' % self.name
                self.connection.close()
                return
            request = scrapy.Request(url=post_url, callback=self.parse_post)
            request.meta['item'] = item
            yield request

    def parse_post(self, response):
        item = response.meta['item']
        content_text = ''.join(
            response.xpath('//div[@class="m-article"]//text()[normalize-space() and not(ancestor::script | ancestor::style)]').extract())
        content_html = response.xpath('//div[@class="m-article"]').extract_first()
        item['content_text'] = content_text.replace('\r', '').replace('\n', '').replace('\t', '')
        item['content_html'] = content_html
        item['crawl_time'] = int(time.time())
        item['site_name'] = self.name
        item['type'] = 'commerce'
        item['module'] = 'brand'
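        # Debug output only: this spider's pipelines are commented out in
        # custom_settings above and parse_post never yields the item, so it
        # appears to still be under development.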
        print item['content_text'], 'content_text'
        print item['content_html']


class EbrunSpider(scrapy.Spider):
    name = 'ebrun'
    start_urls = ['http://www.ebrun.com/brands/']
    custom_settings = {
        'ITEM_PIPELINES': {
            'datapark.pipelines.BrandMongoPipeline': 300,
            'datapark.pipelines.BrandKafkaPipeline': 301,
        }
    }

    # custom attributes
    first_url = ''
    connection = sqlite3.connect('data.sqlite')
    cursor = connection.cursor()
    cursor.execute('CREATE TABLE IF NOT EXISTS %s (latest_url TEXT)' % name)
    connection.commit()
    cursor.execute('SELECT latest_url FROM %s' % name)
    latest_url = cursor.fetchone()
    if latest_url:
        latest_url = latest_url[0]

    def parse(self, response):
        posts = response.xpath('//div[@id="create10"]/div/div')
        for post in posts:
            post_url = response.urljoin(post.xpath('p/span/a/@href').extract_first())
            post_title = post.xpath('p/span/a/text()').extract_first()
            item = {
                '_id': post_url,
                'post_url': post_url,
                'post_title': post_title
            }
            # record the first post as the latest one and save it to SQLite
            if not self.first_url:
                self.first_url = post_url
                self.cursor.execute('DELETE FROM %s' % self.name)
                self.cursor.execute('INSERT INTO %s (latest_url) VALUES (?)' % self.name, (self.first_url,))
                self.connection.commit()
            # compare against the latest URL saved by the previous run; if they match,
            # we have reached posts that were already crawled, so stop; otherwise keep crawling
            if post_url == self.latest_url:
                print '%s - reached the point where the previous crawl stopped' % self.name
                self.connection.close()
                return
            request = scrapy.Request(url=post_url, callback=self.parse_post)
            request.meta['item'] = item
            yield request

    def parse_post(self, response):
        item = response.meta['item']
        content_text = ''.join(
            response.xpath('//div[@class="clearfix cmsDiv"]//text()[normalize-space() and not(ancestor::script | ancestor::style)]').extract())
        content_html = response.xpath('//div[@class="clearfix cmsDiv"]').extract_first()
        item['content_text'] = content_text.replace('\r', '').replace('\n', '').replace('\t', '')
        item['content_html'] = content_html
        item['crawl_time'] = int(time.time())
        item['site_name'] = self.name
        item['type'] = 'commerce'
        item['module'] = 'brand'
        yield item


#################################### conferencespider ###########################
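# The conference spiders below collect event metadata (title, date, venue)
# instead of article bodies; they reuse the same SQLite checkpoint logic as
# the spiders above.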

class Eshow365Spider(scrapy.Spider):
    name = 'eshow365'
    start_urls = ['http://www.eshow365.com/zhanhui/0-0-0-0/0/%E5%B9%BF%E5%91%8A%20%E8%90%A5%E9%94%80']
    custom_settings = {
        'ITEM_PIPELINES': {
            'datapark.pipelines.BrandMongoPipeline': 300,
            'datapark.pipelines.BrandKafkaPipeline': 301,
        }
    }

    # custom attributes
    first_url = ''
    connection = sqlite3.connect('data.sqlite')
    cursor = connection.cursor()
    cursor.execute('CREATE TABLE IF NOT EXISTS %s (latest_url TEXT)' % name)
    connection.commit()
    cursor.execute('SELECT latest_url FROM %s' % name)
    latest_url = cursor.fetchone()
    if latest_url:
        latest_url = latest_url[0]

    def parse(self, response):
        posts = response.xpath('//div[@class="sslist"]')
        for post in posts:
            post_url = response.urljoin(post.xpath('p[@class="zhtitle"]/a/@href').extract_first())
            post_title = post.xpath('p[@class="zhtitle"]/a//text()').extract()
            post_title = ''.join(post_title).strip()
            item = {
                '_id': post_url,
                'post_url': post_url,
                'post_title': post_title
            }
            # record the first post as the latest one and save it to SQLite
            if not self.first_url:
                self.first_url = post_url
                self.cursor.execute('DELETE FROM %s' % self.name)
                self.cursor.execute('INSERT INTO %s (latest_url) VALUES (?)' % self.name, (self.first_url,))
                self.connection.commit()
            # compare against the latest URL saved by the previous run; if they match,
            # we have reached posts that were already crawled, so stop; otherwise keep crawling
            if post_url == self.latest_url:
                print '%s - reached the point where the previous crawl stopped' % self.name
                self.connection.close()
                return
            request = scrapy.Request(url=post_url, callback=self.parse_post)
            request.meta['item'] = item
            yield request

    def parse_post(self, response):
        item = response.meta['item']
        ps = response.xpath('//div[@class="zhxxcontent"]/p')
        conference_time = ''
        conference_address = ''
        for p in ps:
            txt = p.xpath('string(.)').extract_first()
            if u'举办时间' in txt:
                conference_time = txt.split(u'举办时间:')[-1]
            if u'举办展馆' in txt:
                conference_address = txt.split(u'举办展馆:')[-1]
        item['conference_time'] = conference_time
        item['conference_address'] = conference_address
        item['crawl_time'] = int(time.time())
        item['site_name'] = self.name
        item['type'] = 'conference'
        item['module'] = 'brand'
        yield item


class Events_ireasearchSpider(scrapy.Spider):
    name = 'events_ireasearch'
    start_urls = ['http://events.iresearch.cn/']
    custom_settings = {
        'ITEM_PIPELINES': {
            'datapark.pipelines.BrandMongoPipeline': 300,
            'datapark.pipelines.BrandKafkaPipeline': 301,
        }
    }

    # custom attributes
    first_url = ''
    connection = sqlite3.connect('data.sqlite')
    cursor = connection.cursor()
    cursor.execute('CREATE TABLE IF NOT EXISTS %s (latest_url TEXT)' % name)
    connection.commit()
    cursor.execute('SELECT latest_url FROM %s' % name)
    latest_url = cursor.fetchone()
    if latest_url:
        latest_url = latest_url[0]

    def parse(self, response):
        posts = response.xpath('//*[@id="databox"]/li')
        for post in posts:
            post_url = response.urljoin(post.xpath('div[@class="info"]/h3/a/@href').extract_first())
            post_title = post.xpath('div[@class="info"]/h3/a/text()').extract_first()
            conference_info = post.xpath('div[@class="info"]/p/text()').extract_first()
            conference_time = conference_info.split(' ')[0]
            conference_address = conference_info.split(' ')[-1]
            item = {
                '_id': post_url,
                'post_url': post_url,
                'post_title': post_title,
                'conference_time': conference_time,
                'conference_address': conference_address,
                'crawl_time': int(time.time()),
                'site_name': self.name,
                'type': 'conference',
                'module': 'brand',
            }
            # record the first post as the latest one and save it to SQLite
            if not self.first_url:
                self.first_url = post_url
                self.cursor.execute('DELETE FROM %s' % self.name)
                self.cursor.execute('INSERT INTO %s (latest_url) VALUES (?)' % self.name, (self.first_url,))
                self.connection.commit()
            # compare against the latest URL saved by the previous run; if they match,
            # we have reached posts that were already crawled, so stop; otherwise keep crawling
            if post_url == self.latest_url:
                print '%s - reached the point where the previous crawl stopped' % self.name
                self.connection.close()
                return
            yield item
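

# A minimal sketch of running one of these spiders standalone, assuming this
# module lives inside the datapark Scrapy project so that the pipeline classes
# referenced in custom_settings can be imported (hypothetical usage; normally
# `scrapy crawl <spider name>` would be run from the project directory):
if __name__ == '__main__':
    from scrapy.crawler import CrawlerProcess
    from scrapy.utils.project import get_project_settings

    process = CrawlerProcess(get_project_settings())
    process.crawl(SocialbetaSpider)  # or any other spider class defined above
    process.start()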