# -*- coding: utf-8 -*-
from scrapy.item import Item, Field


class RamenStyle(Item):
    shop_id = Field()
    style = Field()
    soup = Field()


class RamenReview(Item):
    review_id = Field()
    shop_id = Field()
    url = Field()
    user_name = Field()
    score = Field()
    title = Field()
    comment = Field()
    image = Field()
    create_date = Field()
    modified_date = Field()


class RamenShop(Item):
    shop_id = Field()
    url = Field()
    name = Field()
    address = Field()
    tel = Field()
    business_hours = Field()
    closed_on = Field()
    capacity = Field()
    smoking = Field()
    station = Field()
    access = Field()
    parking = Field()
    opening_date = Field()
    menu = Field()
    note = Field()
    tags = Field()
    score = Field()
    review_count = Field()
    review_user_count = Field()
    average = Field()
    ranking = Field()
    like = Field()
    lat = Field()
    lon = Field()
    image = Field()
    retire = Field()


class RamenShopImage(Item):
    shop_id = Field()
    blob = Field()
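A note on usage: Scrapy Items behave like dicts, which is why the pipeline below guards optional fields with expressions such as item['url'] if 'url' in item else None. A minimal sketch of that behaviour (the field values here are made up for illustration and are not part of the crawler):

# Hypothetical example, not part of the project files.
from ramendb.items import RamenShop

shop = RamenShop()
shop['shop_id'] = '12559'        # declared fields are assigned dict-style
shop['name'] = u'Example Ramen'
tel = shop.get('tel')            # unset optional fields read back as None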
# -*- coding: utf-8 -*-
"""
MySQL Pipeline
http://snipplr.com/view/66986/mysql-pipeline/
"""
import datetime
import MySQLdb.cursors
from twisted.enterprise import adbapi
from decimal import Decimal
from scrapy import log
from ramendb import settings
from ramendb.items import RamenShop, RamenReview, RamenStyle, RamenShopImage


class MySQLPipeline(object):

    def __init__(self):
        # Twisted connection pool; queries run in a thread pool off the reactor.
        self.dbpool = adbapi.ConnectionPool('MySQLdb',
            host=settings.MYSQL_HOST,
            port=settings.MYSQL_PORT,
            db=settings.MYSQL_DATABASE,
            user=settings.MYSQL_USER,
            passwd=settings.MYSQL_PASSWORD,
            unix_socket=settings.MYSQL_SOCKET,
            cursorclass=MySQLdb.cursors.DictCursor,
            charset='utf8',
            use_unicode=True)
        self.dbpool.runInteraction(self._truncate_table)
    def _truncate_table(self, tx):
        # Archive the previous run's tables by renaming them with a timestamp,
        # then create fresh tables for this crawl.
        today = datetime.datetime.today()
        tx.execute('alter table RamenShop rename to RamenShop_%s'
                   % today.strftime('%Y%m%d%H%M%S'))
        tx.execute('alter table RamenReview rename to RamenReview_%s'
                   % today.strftime('%Y%m%d%H%M%S'))
        tx.execute('alter table RamenStyle rename to RamenStyle_%s'
                   % today.strftime('%Y%m%d%H%M%S'))
        tx.execute('create table RamenShop (\
            ShopID integer unsigned not null, \
            Url text, \
            Name text, \
            Address text, \
            Tel text, \
            BusinessHours text, \
            ClosedOn text, \
            Capacity text, \
            Smoking text, \
            Station text, \
            Access text, \
            Parking text, \
            OpeningDate text, \
            Menu text, \
            Note text, \
            Tags text, \
            Score float, \
            ReviewCount integer, \
            ReviewUserCount integer, \
            Average float, \
            Ranking integer, \
            `Like` integer, \
            Latitude decimal(9,6), \
            Longitude decimal(9,6), \
            Image text, \
            ImageBlob mediumblob, \
            Retire bit, \
            CreateDate datetime, \
            ModifiedDate datetime, \
            primary key(ShopID)\
            ) engine=InnoDB default charset=utf8')
        tx.execute('create table RamenReview (\
            ReviewID integer not null, \
            ShopID integer not null, \
            Url text, \
            UserName text, \
            Score float, \
            Title text, \
            Comment mediumtext, \
            Image text, \
            CreateDate text, \
            ModifiedDate text, \
            primary key(ReviewID)\
            ) engine=InnoDB default charset=utf8')
        tx.execute('create table RamenStyle (\
            ShopID integer not null, \
            Style varchar(255) not null, \
            Soup varchar(255) not null, \
            primary key(ShopID, Style, Soup)\
            ) engine=InnoDB default charset=utf8')
    def process_item(self, item, spider):
        query = self.dbpool.runInteraction(self._conditional_insert, item)
        query.addErrback(self.handle_error)
        return item

    def _conditional_insert(self, tx, item):
        if isinstance(item, RamenShop):
            self._insert_shop(tx, item)
        elif isinstance(item, RamenReview):
            self._insert_review(tx, item)
        elif isinstance(item, RamenStyle):
            self._insert_style(tx, item)
        elif isinstance(item, RamenShopImage):
            self._insert_shop_image(tx, item)
    def _insert_shop(self, tx, item):
        tx.execute('select ShopID from RamenShop where ShopID = %s', item['shop_id'])
        result = tx.fetchone()
        if result:
            tx.execute('update RamenShop set \
                Url = %s, \
                Name = %s, \
                Address = %s, \
                Tel = %s, \
                BusinessHours = %s, \
                ClosedOn = %s, \
                Capacity = %s, \
                Smoking = %s, \
                Station = %s, \
                Access = %s, \
                Parking = %s, \
                OpeningDate = %s, \
                Menu = %s, \
                Note = %s, \
                Tags = %s, \
                Score = %s, \
                ReviewCount = %s, \
                ReviewUserCount = %s, \
                Average = %s, \
                Ranking = %s, \
                `Like` = %s, \
                Latitude = %s, \
                Longitude = %s, \
                Image = %s, \
                Retire = %s, \
                CreateDate = %s \
                where ShopID = %s',
                (item['url'] if 'url' in item else None,
                 item['name'] if 'name' in item else None,
                 item['address'] if 'address' in item else None,
                 item['tel'] if 'tel' in item else None,
                 item['business_hours'] if 'business_hours' in item else None,
                 item['closed_on'] if 'closed_on' in item else None,
                 item['capacity'] if 'capacity' in item else None,
                 item['smoking'] if 'smoking' in item else None,
                 item['station'] if 'station' in item else None,
                 item['access'] if 'access' in item else None,
                 item['parking'] if 'parking' in item else None,
                 item['opening_date'] if 'opening_date' in item else None,
                 item['menu'] if 'menu' in item else None,
                 item['note'] if 'note' in item else None,
                 item['tags'] if 'tags' in item else None,
                 Decimal(item['score']) if 'score' in item else None,
                 int(item['review_count']) if 'review_count' in item else None,
                 int(item['review_user_count']) if 'review_user_count' in item else None,
                 Decimal(item['average']) if 'average' in item else None,
                 int(item['ranking']) if 'ranking' in item else None,
                 int(item['like']) if 'like' in item else None,
                 Decimal(item['lat']) if 'lat' in item else None,
                 Decimal(item['lon']) if 'lon' in item else None,
                 item['image'] if 'image' in item else None,
                 item['retire'] if 'retire' in item else None,
                 datetime.datetime.now(),
                 item['shop_id']))
        else:
            tx.execute('insert into RamenShop ( \
                ShopID, \
                Url, \
                Name, \
                Address, \
                Tel, \
                BusinessHours, \
                ClosedOn, \
                Capacity, \
                Smoking, \
                Station, \
                Access, \
                Parking, \
                OpeningDate, \
                Menu, \
                Note, \
                Tags, \
                Score, \
                ReviewCount, \
                ReviewUserCount, \
                Average, \
                Ranking, \
                `Like`, \
                Latitude, \
                Longitude, \
                Image, \
                Retire, \
                CreateDate \
                ) values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)',
                (int(item['shop_id']),
                 item['url'] if 'url' in item else None,
                 item['name'] if 'name' in item else None,
                 item['address'] if 'address' in item else None,
                 item['tel'] if 'tel' in item else None,
                 item['business_hours'] if 'business_hours' in item else None,
                 item['closed_on'] if 'closed_on' in item else None,
                 item['capacity'] if 'capacity' in item else None,
                 item['smoking'] if 'smoking' in item else None,
                 item['station'] if 'station' in item else None,
                 item['access'] if 'access' in item else None,
                 item['parking'] if 'parking' in item else None,
                 item['opening_date'] if 'opening_date' in item else None,
                 item['menu'] if 'menu' in item else None,
                 item['note'] if 'note' in item else None,
                 item['tags'] if 'tags' in item else None,
                 Decimal(item['score']) if 'score' in item else None,
                 int(item['review_count']) if 'review_count' in item else None,
                 int(item['review_user_count']) if 'review_user_count' in item else None,
                 Decimal(item['average']) if 'average' in item else None,
                 int(item['ranking']) if 'ranking' in item else None,
                 int(item['like']) if 'like' in item else None,
                 Decimal(item['lat']) if 'lat' in item else None,
                 Decimal(item['lon']) if 'lon' in item else None,
                 item['image'] if 'image' in item else None,
                 item['retire'] if 'retire' in item else None,
                 datetime.datetime.now()))
    def _insert_shop_image(self, tx, item):
        tx.execute('select ShopID from RamenShop where ShopID = %s', item['shop_id'])
        result = tx.fetchone()
        if result:
            tx.execute('update RamenShop set ImageBlob = %s where ShopID = %s',
                (item['blob'] if 'blob' in item else None, item['shop_id']))
        else:
            # The conditional applies to the blob value only, not to the whole
            # parameter tuple; otherwise execute() would receive None as its args.
            tx.execute('insert into RamenShop (ShopID, ImageBlob) values (%s, %s)',
                (int(item['shop_id']), item['blob'] if 'blob' in item else None))
    def _insert_review(self, tx, item):
        tx.execute('select ReviewID from RamenReview where ReviewID = %s', item['review_id'])
        result = tx.fetchone()
        if result:
            pass
        else:
            tx.execute('insert into RamenReview ( \
                ReviewID, \
                ShopID, \
                Url, \
                UserName, \
                Score, \
                Title, \
                Comment, \
                Image, \
                CreateDate, \
                ModifiedDate \
                ) values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)',
                (int(item['review_id']),
                 int(item['shop_id']),
                 item['url'] if 'url' in item else None,
                 item['user_name'] if 'user_name' in item else None,
                 Decimal(item['score']) if 'score' in item else None,
                 item['title'] if 'title' in item else None,
                 item['comment'] if 'comment' in item else None,
                 item['image'] if 'image' in item else None,
                 item['create_date'] if 'create_date' in item else None,
                 item['modified_date'] if 'modified_date' in item else None))
    def _insert_style(self, tx, item):
        tx.execute('select ShopID from RamenStyle where ShopID = %s and Style = %s and Soup = %s',
                   [item['shop_id'], item['style'], item['soup']])
        result = tx.fetchone()
        if result:
            pass
        else:
            tx.execute('insert into RamenStyle ( \
                ShopID, \
                Style, \
                Soup \
                ) values (%s,%s,%s)',
                (int(item['shop_id']),
                 item['style'],
                 item['soup']))

    def handle_error(self, e):
        log.err(e)
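The pipeline reads its connection parameters from ramendb.settings and has to be registered with Scrapy. A minimal sketch of the relevant settings, assuming the pipeline lives in ramendb/pipelines.py; the module path, host, credentials and socket path below are placeholders, not values from the original project:

# Hypothetical excerpt from ramendb/settings.py -- all values are placeholders.
BOT_NAME = 'ramendb'
SPIDER_MODULES = ['ramendb.spiders']

# Old-style list form; newer Scrapy releases expect a dict with priorities.
ITEM_PIPELINES = ['ramendb.pipelines.MySQLPipeline']

MYSQL_HOST = 'localhost'
MYSQL_PORT = 3306
MYSQL_DATABASE = 'ramendb'
MYSQL_USER = 'scrapy'
MYSQL_PASSWORD = 'secret'
MYSQL_SOCKET = '/var/run/mysqld/mysqld.sock'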
# -*- coding: utf-8 -*-
import re
from boto.s3.key import Key
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from ramendb.items import RamenReview, RamenStyle, RamenShop, RamenShopImage
from scrapy.http.request import Request
from scrapy import log
import json
from ramendb.spiders import s3bucket


def _remove_html_tags(data):
    p = re.compile(r'<.*?>')
    return p.sub('', data)


def _remove_anchor_tag(data):
    p = re.compile(r'<a.*?>.*?</a>')
    return p.sub('', data)


def _remove_tab_code(data):
    return data.replace('\t', '')


def _remove_lf_code(data):
    return data.replace('\n', '')


def _parse_break_tag(data):
    p = re.compile(r'<br.*?>')
    return p.sub('\n', data.replace('\n', ''))
def _find_keyword(value):
    # http://ramendb.supleks.jp/search/shop?q=%E9%86%A4%E6%B2%B9
    m = re.search('q=(.*?)$', value)
    if m:
        return m.group(1)


def _find_unique_id(value):
    # http://ramendb.supleks.jp/s/12559.html
    m = re.search('\/(\d+?).html', value)
    if m:
        return m.group(1)


def _find_next_link(value):
    # window.location.href='/search/shop?page=100&q=%E9%86%A4%E6%B2%B9';return false;
    m = re.search('href=\'(.*?)\'', value)
    if m:
        return m.group(1)


def _find_lat_lon(value):
    # http://maps.google.com/maps/api/staticmap?center=35.88240910824,139.82840538025&zoom=15&size=204x180&markers=icon:http://s1.supleks.jp/shopdb/icon16_star_yellow.png|35.88240910824,139.82840538025&language=ja&sensor=false
    m = re.search('center=([-|\.|\d]+?),([-|\.|\d]+?)&', value)
    if m:
        return m.group(1, 2)
def _find_create_modified_date(value):
    # e.g. "2012年7月3日投稿(2012年7月4日更新) | ..." -> (posted date, updated date)
    m = re.search(u'^(\d+年\d+月\d+日)?(投稿)?(\((\d+年\d+月\d+日)?(更新)\))? |', value, re.U)
    if m:
        return m.group(1, 4)
def _find_style(value):
    # [汁なし/鶏白湯]
    m = re.search('\[(.*?)/(.*?)\]', value)
    if m:
        return m.group(1, 2)


def _find_graph_array(key, value):
    m = re.search("reviewGraph\.addData\(\'" + key + "\'\, (.*?)\)\;", value)
    if m:
        return m.group(1).replace('name', '"name"').replace('data', '"data"').replace("'", '"')
class RamenDBSpider(BaseSpider):
    name = 'ramendb'
    allowed_domains = ['ramendb.supleks.jp']
    start_urls = [
        # 醤油 (soy sauce)
        # 'http://ramendb.supleks.jp/search/shop?q=%E9%86%A4%E6%B2%B9',
        # 味噌 (miso)
        # 'http://ramendb.supleks.jp/search/shop?q=%e5%91%b3%e5%99%8c',
        # 塩 (salt)
        # 'http://ramendb.supleks.jp/search/shop?q=%e5%a1%a9',
        # 豚骨 (pork bone)
        # 'http://ramendb.supleks.jp/search/shop?q=%e8%b1%9a%e9%aa%a8',
        # 鶏白湯 (chicken paitan)
        # 'http://ramendb.supleks.jp/search/shop?q=%e9%b6%8f%e7%99%bd%e6%b9%af',
        # 担々麺 (tantanmen)
        # 'http://ramendb.supleks.jp/search/shop?q=%e6%8b%85%e3%80%85%e9%ba%ba',
        # カレー (curry)
        # 'http://ramendb.supleks.jp/search/shop?q=%e3%82%ab%e3%83%ac%e3%83%bc',
        # Ranking
        # 'http://ramendb.supleks.jp/rank'
        # Search
        'http://ramendb.supleks.jp/search'
    ]
    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        """
        for a in hxs.select('//ul[@class="searched"]//a[@class="name"]'):
            href = a.select('@href').extract()[0]
            request = Request('http://ramendb.supleks.jp' + href, self.parse_shop)
            yield request
        for a in hxs.select('//span[@class="name"]/a'):
            href = a.select('@href').extract()[0]
            request = Request('http://ramendb.supleks.jp' + href, self.parse_shop)
            yield request
        """
        # Follow every shop link on the search result page.
        for a in hxs.select('//div[@class="name"]/h4/a'):
            href = a.select('@href').extract()[0]
            request = Request('http://ramendb.supleks.jp' + href, self.parse_shop)
            yield request
        """
        for a in hxs.select('//div[@class="menu"]/span/a'):
            href = a.select('@href').extract()[0]
            request = Request('http://ramendb.supleks.jp' + href, self.parse_review)
            yield request
        """
        # Pagination links carry the next URL inside their onclick handler.
        for page in hxs.select('//a[@class="page"]'):
            href = _find_next_link(page.select('@onclick').extract()[0])
            request = Request('http://ramendb.supleks.jp' + href, self.parse)
            yield request
    def parse_review(self, response):
        hxs = HtmlXPathSelector(response)
        alert = hxs.select('//*[@id="waku"]')
        if len(alert) > 0:
            return
        style = RamenStyle()
        style['shop_id'] = _find_unique_id(hxs.select('//div[@class="props"]/span[1]/a[1]/@href').extract()[0])
        style['style'], style['soup'] = _find_style(hxs.select('//span[@class="style"]/text()').extract()[0])
        yield style
        review = RamenReview()
        review['shop_id'] = style['shop_id']
        review['review_id'] = _find_unique_id(hxs.select('//div[@class="menu"]/span/a/@href').extract()[0])
        review['url'] = response.url
        review['user_name'] = hxs.select('//div[@class="props"]/span[2]/a/text()').extract()[0]
        review['score'] = hxs.select('//div[@class="score"]/text()').extract()[0]
        review['title'] = hxs.select('//div[@class="menu"]/span/a/text()').extract()[0]
        review['comment'] = _remove_html_tags(_parse_break_tag(hxs.select('//div[@class="comment"]/p').extract()[0]))
        image = hxs.select('//div[@class="comment"]/p/a/img/@src').extract()
        if len(image) > 0:
            review['image'] = image[0]
        review['create_date'], review['modified_date'] = _find_create_modified_date(hxs.select('//div[@class="foot"]//div[@class="right"]/text()').extract()[0])
        yield review
    def parse_shop(self, response):
        hxs = HtmlXPathSelector(response)
        alert = hxs.select('//div[@class="alertbox"]')
        if len(alert) > 0:
            return
        """
        for a in hxs.select('//a[@class="btn-more"]'):
            href = a.select('@href').extract()[0]
            if not href.endswith('/photo/review'):
                request = Request('http://ramendb.supleks.jp' + href, self.parse)
                yield request
        """
        shop = RamenShop()
        shop['retire'] = True if len(hxs.select('//span[@class="plate retire"]/text()')) > 0 else False
        #shop['url'] = hxs.select('//meta[@property="og:url"]/@content').extract()[0]
        shop['url'] = response.url
        shop['shop_id'] = _find_unique_id(shop['url'])
        shop['image'] = hxs.select('//meta[@property="og:image"]/@content').extract()[0]
        shop['lat'], shop['lon'] = _find_lat_lon(hxs.select('//a[@id="minimap"]//img/@src').extract()[0])
        point = hxs.select('//div[@id="point"]')
        point_float = point.select('span[@class="float"]')
        if len(point) > 0 and len(point_float) > 0:
            shop['score'] = point.select('span[@class="int"]/text()').extract()[0] + point.select('span[@class="float"]/text()').extract()[0]
        # The shop data table uses Japanese row headers (店名 = name, 住所 = address, etc.).
        for row in hxs.select('//table[@id="data-table"]//tr'):
            th = row.select('th/text()').extract()[0]
            if th == u'店名':
                shop['name'] = row.select('td/strong/text()').extract()[0]
            elif th == u'住所':
                shop['address'] = _remove_lf_code(_remove_tab_code(_remove_html_tags(row.select('td').extract()[0])))
            elif th == u'電話番号':
                shop['tel'] = row.select('td/text()').extract()[0]
            elif th == u'営業時間':
                shop['business_hours'] = row.select('td/text()').extract()[0]
            elif th == u'定休日':
                shop['closed_on'] = row.select('td/text()').extract()[0]
            elif th == u'席数':
                shop['capacity'] = row.select('td/text()').extract()[0]
            elif th == u'喫煙':
                shop['smoking'] = row.select('td/text()').extract()[0]
            elif th == u'最寄り駅':
                shop['station'] = _remove_lf_code(_remove_tab_code(_remove_html_tags(row.select('td').extract()[0])))
            elif th == u'アクセス':
                shop['access'] = row.select('td/text()').extract()[0]
            elif th == u'駐車場':
                shop['parking'] = row.select('td/text()').extract()[0]
            elif th == u'開店日':
                shop['opening_date'] = row.select('td/text()').extract()[0]
            elif th == u'メニュー':
                shop['menu'] = _remove_html_tags(_remove_anchor_tag(_parse_break_tag(row.select('td/p[@class="more"]').extract()[0])))
            elif th == u'備考':
                shop['note'] = _remove_html_tags(_remove_anchor_tag(_parse_break_tag(row.select('td/p[@class="more"]/node()').extract()[0])))
            elif th == u'タグ':
                shop['tags'] = ','.join([node for node in row.select('td/a/text()').extract()])
        # [:-1] drops the trailing unit character; replace(',', '') strips thousands separators.
        for row in hxs.select('//table[@class="key-value"]//tr'):
            th = row.select('th/text()').extract()[0]
            if th == u'レビュー件数':
                shop['review_count'] = row.select('td/text()').extract()[0][:-1].replace(',', '')
            elif th == u'レビューユーザー数':
                shop['review_user_count'] = row.select('td/text()').extract()[0][:-1].replace(',', '')
            elif th == u'平均点':
                shop['average'] = row.select('td/text()').extract()[0][:-1].replace(',', '')
            elif th == u'総合順位':
                ranking = row.select('td/text()').extract()[0][:-1].replace(',', '')
                if len(ranking) > 0:
                    shop['ranking'] = ranking
            elif th == u'スキ':
                shop['like'] = row.select('td/text()').extract()[0][:-1].replace(',', '')
        yield shop
        # Style/soup combinations are embedded in the review graph's inline JavaScript.
        graph = hxs.select('//div[@id="graph"]/following-sibling::node()[name()="script"]/text()').extract()
        if len(graph) > 0:
            styles = []
            graphArrayStyle = _find_graph_array('all_0', graph[0])
            if graphArrayStyle is not None:
                jsonStyles = json.loads(graphArrayStyle)
                for jsonStyle in jsonStyles:
                    if sum(jsonStyle['data']) > 0:
                        styles.append(jsonStyle['name'])
            soups = []
            graphArraySoup = _find_graph_array('all_1', graph[0])
            if graphArraySoup is not None:
                jsonSoups = json.loads(graphArraySoup)
                for jsonSoup in jsonSoups:
                    if sum(jsonSoup['data']) > 0:
                        soups.append(jsonSoup['name'])
            for style in styles:
                for soup in soups:
                    ramenStyle = RamenStyle()
                    ramenStyle['shop_id'] = shop['shop_id']
                    ramenStyle['style'] = style
                    ramenStyle['soup'] = soup
                    yield ramenStyle
        if shop['image']:
            # The shop_id travels to the image callback via a request header.
            request = Request(
                headers={'shop_id': shop['shop_id']},
                url=shop['image'],
                callback=self.parse_shop_image)
            yield request
    def parse_shop_image(self, response):
        # Keep the raw image bytes on the item (for the MySQL blob column)
        # and also upload a copy to S3.
        image = RamenShopImage()
        image['shop_id'] = response.request.headers.get('shop_id')
        image['blob'] = response.body
        key = Key(s3bucket)
        key.key = 'ramendb/img/shop/%s.jpg' % image['shop_id']
        key.set_contents_from_string(response.body, replace=True, headers={'Content-Type': 'image/jpeg'})
        key.set_acl('public-read')
        return image
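The spider imports s3bucket from the ramendb.spiders package, but that module is not part of this gist. A minimal sketch of what ramendb/spiders/__init__.py might look like; the settings names and bucket name here are placeholders, not taken from the original project:

# Hypothetical ramendb/spiders/__init__.py -- credentials and bucket name are placeholders.
import boto
from ramendb import settings

s3conn = boto.connect_s3(settings.AWS_ACCESS_KEY_ID, settings.AWS_SECRET_ACCESS_KEY)
s3bucket = s3conn.get_bucket('ramendb-images')

With the settings, pipeline and bucket in place, the crawl would be started from the project directory with: scrapy crawl ramendb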