Last active
August 29, 2015 14:03
-
-
Save aguegu/e45f24b2c74a0f2bc661 to your computer and use it in GitHub Desktop.
BendixCrawler
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# EditorConfig is awesome: http://EditorConfig.org | |
# top-most EditorConfig file | |
root = true | |
# Unix-style newlines with a newline ending every file | |
[*] | |
end_of_line = lf | |
insert_final_newline = true | |
trim_trailing_whitespace = true | |
insert_final_newline = true | |
charset = utf-8 | |
indent_style = space | |
[*.py] | |
indent_size = 4 | |
tab_width = 4 | |
[*.yml] | |
indent_size = 2 | |
tab_width = 2 | |
[Makefile] | |
indent_style = tab | |
indent_size = 4 | |
tab_width = 4 |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
*.pyc | |
*.sqlite | |
*.zip | |
pdf/ | |
gif/ |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
sql: | |
brand: | |
init: | | |
DROP TABLE IF EXISTS BRAND; | |
CREATE TABLE BRAND ( | |
ID INTEGER PRIMARY KEY AUTOINCREMENT, | |
TITLE TEXT NOT NULL UNIQUE); | |
insert: | | |
INSERT INTO BRAND (TITLE) VALUES (?); | |
select: | | |
SELECT TITLE, | |
ID | |
FROM BRAND; | |
serie: | |
init: | | |
DROP TABLE IF EXISTS SERIE; | |
CREATE TABLE SERIE ( | |
ID INTEGER PRIMARY KEY AUTOINCREMENT, | |
TITLE TEXT NOT NULL, | |
BRAND_ID INTEGER NOT NULL); | |
insert: | | |
INSERT INTO SERIE (TITLE, BRAND_ID) VALUES (?, ?); | |
select: | | |
SELECT BRAND.TITLE, | |
SERIE.TITLE, | |
SERIE.ID | |
FROM BRAND, SERIE | |
WHERE SERIE.BRAND_ID = BRAND.ID; | |
model: | |
init: | | |
DROP TABLE IF EXISTS MODEL; | |
CREATE TABLE MODEL ( | |
ID INTEGER PRIMARY KEY AUTOINCREMENT, | |
HREF TEXT NOT NULL, | |
TITLE TEXT NOT NULL, | |
YEAR TEXT NOT NULL, | |
FRONT TEXT, | |
REAR TEXT, | |
PARK TEXT, | |
SERIE_ID INTEGER NOT NULL); | |
insert: | | |
INSERT INTO MODEL ( | |
HREF, | |
TITLE, | |
YEAR, | |
FRONT, | |
REAR, | |
PARK, | |
SERIE_ID) | |
VALUES (?, ?, ?, ?, ?, ?, ?); | |
select: | | |
SELECT HREF, | |
ID | |
FROM MODEL | |
WHERE ID | |
NOT IN (SELECT DISTINCT MODEL_ID | |
FROM PRODUCT); | |
product: | |
init: | | |
CREATE TABLE IF NOT EXISTS PRODUCT ( | |
ID INTEGER PRIMARY KEY AUTOINCREMENT, | |
TITLE TEXT NOT NULL, | |
CONTENT TEXT, | |
IMAGE TEXT, | |
PDF TEXT, | |
MODEL_ID INTEGER); | |
insert: | | |
INSERT INTO PRODUCT ( | |
TITLE, | |
CONTENT, | |
IMAGE, | |
PDF, | |
MODEL_ID) | |
VALUES (?, ?, ?, ?, ?); | |
image: | |
select: | | |
SELECT DISTINCT IMAGE | |
FROM PRODUCT | |
WHERE IMAGE != ""; | |
pdf: | |
select: | | |
SELECT DISTINCT PDF | |
FROM PRODUCT | |
WHERE PDF != "" | |
AND PDF != '#'; | |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
import gevent.monkey | |
gevent.monkey.patch_socket() | |
import requests | |
import gevent | |
import simplejson | |
import pickle | |
from pyquery import PyQuery | |
from gevent.queue import JoinableQueue, Queue | |
from sqliter import Sqliter | |
import yaml | |
with open('config.yml') as f: | |
rules = yaml.load(f) | |
def queue2list(q): | |
l = [] | |
while not q.empty(): | |
l.append(q.get()) | |
return l | |
def list2queue(l, queue_type=Queue): | |
q = queue_type() | |
map(q.put, l) | |
return q | |
def pick(s, tag): | |
s = s.replace('\\x3c', '<').replace('\\x3e', '>') | |
s = s.replace('\\"', '"') | |
s = s[s.rfind('<%s' % tag): s.rfind('</%s>' % tag) + len('</%s>' % tag)] | |
return s | |
def inject(q_out, conn, sql): | |
running = True | |
while running: | |
l = queue2list(q_out) | |
if len(l): | |
if isinstance(l[-1], str) and l[-1] == 'done': | |
running = False | |
l.pop() | |
conn.execute_many(sql, l) | |
gevent.sleep(2) | |
def fetch_brand(q_in, q_out): | |
while True: | |
s = q_in.get() | |
if isinstance(s, str) and s == 'done': | |
q_in.task_done() | |
break | |
while True: | |
try: | |
r = requests.get('http://www.bendix.com.au/autocomplete/manufacturer/%c' % s) | |
j = r.json() | |
except simplejson.JSONDecodeError: | |
pass | |
else: | |
if type(j) is dict: | |
map(q_out.put, [(x, ) for x in j.values()]) | |
q_in.task_done() | |
break | |
def get_brand(): | |
conn = Sqliter('bendix.sqlite') | |
conn.execute_script(rules['sql']['brand']['init']) | |
q_in = list2queue([chr(x) for x in range(ord('A'), ord('Z') + 1)], JoinableQueue) | |
q_out = Queue() | |
for _ in xrange(4): | |
q_in.put('done') | |
gevent.spawn(fetch_brand, q_in, q_out) | |
injector = gevent.spawn(inject, q_out, conn, rules['sql']['brand']['insert']) | |
q_in.join() | |
q_out.put('done') | |
injector.join() | |
def fetch_serie(q_in, q_out): | |
while True: | |
brand = q_in.get() | |
if isinstance(brand, str) and brand == 'done': | |
q_in.task_done() | |
break | |
params = dict( | |
manufacturer=brand[0], | |
view_name="search_by_vehicle", | |
view_display_id="page_1", | |
) | |
r = requests.get('http://www.bendix.com.au/views/ajax', params=params) | |
s = pick(r.text, 'select') | |
d = PyQuery(s) | |
l = [e.attr['value'] for e in d('option').items() if e.attr['value']] | |
map(q_out.put, zip(l, [brand[1]]*len(l))) | |
q_in.task_done() | |
def get_serie(): | |
conn = Sqliter('bendix.sqlite') | |
conn.execute_script(rules['sql']['serie']['init']) | |
q_in = list2queue(conn.select(rules['sql']['brand']['select']), JoinableQueue) | |
q_out = Queue() | |
for _ in xrange(8): | |
q_in.put('done') | |
gevent.spawn(fetch_serie, q_in, q_out) | |
injector = gevent.spawn(inject, q_out, conn, rules['sql']['serie']['insert']) | |
q_in.join() | |
q_out.put('done') | |
injector.join() | |
def fetch_model(q_in, q_out): | |
while True: | |
serie = q_in.get() | |
if isinstance(serie, str) and serie == 'done': | |
q_in.task_done() | |
break | |
print serie[2] | |
params = dict( | |
manufacturer=serie[0], | |
model=serie[1], | |
view_name="search_by_vehicle", | |
view_display_id="page_1", | |
) | |
r=requests.get('http://www.bendix.com.au/views/ajax', params=params) | |
s = pick(r.text, 'tbody') | |
d = PyQuery(s) | |
l = zip ( | |
[x.attr['href'] for x in d('tr').items()], | |
[x.text() for x in d('td.views-field-date a').items()], | |
[x.text() for x in d('td.views-field-variant a').items()], | |
[x.text() for x in d('td.views-field-front a').items()], | |
[x.text() for x in d('td.views-field-rear a').items()], | |
[x.text() for x in d('td.views-field-park a').items()], | |
[serie[2] for _ in d('tr').items()]) | |
map(q_out.put, l) | |
q_in.task_done() | |
def get_model(): | |
conn = Sqliter('bendix.sqlite') | |
conn.execute_script(rules['sql']['model']['init']) | |
q_in = list2queue(conn.select(rules['sql']['serie']['select']), JoinableQueue) | |
q_out = Queue() | |
for _ in xrange(16): | |
q_in.put('done') | |
gevent.spawn(fetch_model, q_in, q_out) | |
injector = gevent.spawn(inject, q_out, conn, rules['sql']['model']['insert']) | |
q_in.join() | |
q_out.put('done') | |
injector.join() | |
def fetch_product(q_in, q_out): | |
while True: | |
model = q_in.get() | |
if isinstance(model, str) and model == 'done': | |
q_in.task_done() | |
break | |
print model[1] | |
try: | |
r = requests.get('http://www.bendix.com.au%s' % model[0]) | |
d = PyQuery(r.text) | |
except requests.ConnectionError, e: | |
print model[1], e | |
else: | |
for p in d('table.single-vehicle:not(:empty)').items(): | |
q_out.put((p('span').text(), | |
p('p:last').text(), | |
p('td.image img').attr('src'), | |
p('a:last').attr('href'), | |
model[1])) | |
finally: | |
q_in.task_done() | |
def get_product(): | |
conn = Sqliter('bendix.sqlite') | |
conn.execute_script(rules['sql']['product']['init']) | |
q_in = list2queue(conn.select(rules['sql']['model']['select']), JoinableQueue) | |
q_out = Queue() | |
for _ in xrange(16): | |
q_in.put('done') | |
gevent.spawn(fetch_product, q_in, q_out) | |
injector = gevent.spawn(inject, q_out, conn, rules['sql']['product']['insert']) | |
q_in.join() | |
q_out.put('done') | |
injector.join() | |
def fetch_file(q_in, dest): | |
while True: | |
url = q_in.get() | |
if isinstance(url, str) and url == 'done': | |
q_in.task_done() | |
break | |
filename = url[0][url[0].rfind('/') + 1:] | |
u = 'http://www.bendix.com.au' + url[0] if url[0][0] == '/' else url[0] | |
r = requests.get(u) | |
with open('%s/%s' % (dest, filename), 'wb') as f: | |
f.write(r.content) | |
q_in.task_done() | |
def get_file(): | |
conn = Sqliter('bendix.sqlite') | |
q_in = list2queue(conn.select(rules['sql']['image']['select']), JoinableQueue) | |
for _ in xrange(16): | |
q_in.put('done') | |
gevent.spawn(fetch_file, q_in, 'gif') | |
q_in.join() | |
q_in = list2queue(conn.select(rules['sql']['pdf']['select']), JoinableQueue) | |
for _ in xrange(16): | |
q_in.put('done') | |
gevent.spawn(fetch_file, q_in, 'pdf') | |
q_in.join() | |
if __name__ == '__main__': | |
#get_brand() | |
#get_serie() | |
#get_model() | |
#get_product() | |
get_file() |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
import sqlite3 | |
import re | |
class Sqliter: | |
@staticmethod | |
def match(expr, item): | |
return re.match(expr, item) is not None | |
def __init__(self, db): | |
self.conn = sqlite3.connect(db) | |
self.conn.create_function("MATCHES", 2, Sqliter.match) | |
def select(self, sql, row_factory = None): | |
self.conn.row_factory = row_factory | |
cursor = self.conn.cursor() | |
cursor.execute(sql) | |
lst = cursor.fetchall() | |
cursor.close() | |
return lst | |
def execute_script(self, sql): | |
self.conn.executescript(sql) | |
self.conn.commit() | |
def execute_many(self, sql, lst): | |
self.conn.executemany(sql, lst) | |
self.conn.commit() | |
def __del__(self): | |
self.conn.close() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment