Skip to content

Instantly share code, notes, and snippets.

@seyrenus
Created November 17, 2015 14:51
Show Gist options
  • Save seyrenus/2cdc03c64f254a857461 to your computer and use it in GitHub Desktop.
Save seyrenus/2cdc03c64f254a857461 to your computer and use it in GitHub Desktop.
pyspider script
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2015-04-29 17:01:27
# Project: spider
from pyspider.libs.base_handler import *
from pyquery import PyQuery as pq
import MySQLdb
import sys,os
# Force the interpreter-wide default string encoding to UTF-8.
#
# This is a Python 2 workaround: ``sys.setdefaultencoding`` is removed from
# the ``sys`` namespace during interpreter start-up, so the module has to be
# reloaded to get it back.  On interpreters whose default encoding is already
# UTF-8 the branch is skipped entirely and ``reload`` is never looked up.
default_encoding = 'utf-8'
if sys.getdefaultencoding() != default_encoding:
    reload(sys)
    sys.setdefaultencoding(default_encoding)
class Handler(BaseHandler):
    """pyspider handler that walks hc360.com enterprise search results.

    Flow, as implemented below:

    1. ``on_start`` requests the first search-result page.
    2. ``index_page`` reads company name / address / province / city out of
       every result entry, schedules the company's own page with those
       fields attached via ``save``, and follows the "next page" link.
    3. ``detailInformation`` returns the saved fields as the task result.
    4. ``on_result`` inserts the fields into the ``company1`` MySQL table.
    """

    crawl_config = {
        "headers": {
            "User-Agent": "BaiDuSpider",
        }
    }

    @every(minutes=24 * 60)
    def on_start(self):
        """Seed the crawl with the first enterprise search-result page."""
        self.crawl('http://s.hc360.com/?w=%C2%C1&mc=enterprise', callback=self.index_page)

    @config(age=1 * 24 * 60 * 60)
    def index_page(self, response):
        """Extract the companies listed on one result page and paginate.

        For each ``DL`` entry the text fields are collected into a dict and
        passed along (``save=``) with a crawl of the company's link, so that
        ``detailInformation`` can return them.  Missing text is stored as
        ``''``.  Entries without a link are skipped.
        """
        for each in response.doc('HTML>BODY>DIV.s-layout>DIV.s-mod-main>DIV.cont-left>DIV.col>DIV.contbox>DL').items():
            para = pq(each)
            fields = para('DD')
            # The third DD holds the region links: first <a> is the
            # province, second <a> is the city.
            region_links = fields.eq(2)('a')
            title_link = para('DD.til>H3>A')

            url = title_link.attr['href']
            if not url:
                # Nothing to fetch for this entry, so no result can be
                # produced for it.
                continue

            parameter = {
                "type": "companyIndex",
                "companyName": title_link.text() or '',
                "address": fields.eq(3).text() or '',
                "province": region_links.eq(0).text() or '',
                "city": region_links.eq(1).text() or '',
            }
            self.crawl(url, callback=self.detailInformation, save=parameter)

        # Next page
        nextUrl = response.doc('HTML>BODY>DIV.s-layout>DIV.s-mod-main>DIV.cont-left>FORM>DIV.s-mod-page>SPAN.page_next.page-n>A').attr['href']
        print(nextUrl)
        if nextUrl:
            # Only paginate when the page actually exposes a next-page link.
            self.crawl(nextUrl, callback=self.index_page)

    @config(priority=2)
    def detailInformation(self, response):
        """Return the fields captured on the index page as the task result.

        The fetched page body itself is not inspected; only ``response.save``
        (the dict built in ``index_page``) is forwarded.
        """
        return {
            "parameter": response.save
        }

    def on_result(self, result):
        """Insert one company record into the ``company1`` table.

        Results that are empty or carry no ``"parameter"`` payload are
        ignored.  The base-class ``on_result`` is not invoked.  A new
        connection is opened per result and is always closed, even when the
        insert fails; the exception itself is left to propagate.
        """
        parameter = result.get("parameter") if result else None
        if not parameter:
            return

        sql = "insert into company1(companyname,address,province,city) values(%s, %s, %s, %s)"
        data = (
            parameter["companyName"],
            parameter["address"],
            parameter["province"],
            parameter["city"],
        )

        db = getDB()
        try:
            cursor = db.cursor()
            try:
                # Parameterized query: values are passed separately from
                # the SQL text.
                cursor.execute(sql, data)
                db.commit()
            finally:
                cursor.close()
        finally:
            db.close()
def getDB(host='10.10.101.10', user='root', passwd='123456', db='test', charset="utf8"):
    """Open and return a new MySQL connection.

    Called with no arguments this connects exactly as before; every
    connection setting can now be overridden by keyword so the target
    database is no longer fixed in the function body.

    NOTE(review): the defaults still embed a root password in source.
    Consider supplying real credentials from configuration instead.

    :param host: MySQL server host.
    :param user: account name.
    :param passwd: account password.
    :param db: database (schema) to select.
    :param charset: connection character set.
    :return: the connection object returned by ``MySQLdb.connect``; the
        caller is responsible for closing it.
    """
    return MySQLdb.connect(host=host, user=user, passwd=passwd, db=db, charset=charset)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment