@axiaoxin · Created April 22, 2015
spider demo
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import re
import urllib2
import sqlite3
from BeautifulSoup import BeautifulSoup
import sys
import getopt
import threading

# URLs waiting to be crawled at the current depth
urls = []
# URLs collected for the next depth level
next_deep_urls = []
# all URLs that have already been crawled
did_urls = []
# current crawl depth
current_deep = 1
# whether to run the built-in self test
self_test = False

# sqlite3 database helper
class DB(object):
    # connect and create the result table if it does not exist yet
    def __init__(self, dbfile, logger):
        self.logger = logger
        self.conn = sqlite3.connect(dbfile)
        self.cur = self.conn.cursor()
        self.cur.execute("""create table if not exists t_info(
                            id integer primary key autoincrement,
                            keyword text,
                            url text not null,
                            content text not null
                            )""")

    # insert the content of a page that matched the keyword
    def insert(self, v_keyword, v_url, v_content):
        try:
            # parameterized query, so quotes in the content cannot break the SQL
            self.cur.execute("insert into t_info(keyword, url, content) values(?, ?, ?)",
                             (v_keyword, v_url, v_content))
            self.conn.commit()
        except Exception, e:
            self.logger.level2(str(e) + v_url)
        finally:
            # every DB instance performs a single insert, so close the connection
            self.cur.close()
            self.conn.close()

    # print the rows stored for the given keyword
    def show_info(self, v_keyword):
        if v_keyword is None:
            self.cur.execute("select keyword, url, content from t_info where keyword is null")
        else:
            self.cur.execute("select keyword, url, content from t_info where keyword = ?",
                             (v_keyword,))
        result = self.cur.fetchall()
        for data in result:
            print "-" * 20
            print "keyword :", data[0]
            print "url :", data[1]
            print "content :", data[2]
            print "-" * 20
            print
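
# For inspecting the stored rows outside this script, the sqlite3 command-line
# shell works as well (assuming the default ./spider.db path and a crawl that
# used --keyword python; adjust both to your run):
#   sqlite3 spider.db "select url from t_info where keyword = 'python';"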

# collect all URLs found in a page
def get_urls(url, content, logger):
    urls = []
    try:
        soup = BeautifulSoup(content)
        # keep only hrefs that start with http(s) or with /
        hrefs = soup.findAll(href=re.compile(r'^(https?:|/)'))
        # pull the link out of each href attribute
        for href in hrefs:
            pattern = re.compile(r'href=(\'|\")(.*?)(\'|\")')
            match = pattern.search(str(href))
            if match:
                # group 2 holds the url value
                u = match.group(2)
                # turn a link that starts with / into an absolute link by
                # prepending the current url (minus its trailing /)
                if u.startswith('/'):
                    u = url.rstrip('/') + u
                urls.append(u)
    except Exception, e:
        logger.level5(str(e))
    return urls

# extract the text inside the <body> of a page
def get_body(content):
    soup = BeautifulSoup(content)
    # drop newlines so the regexes below can match across the whole body
    body = str(soup('body')).replace('\n', '')
    # strip quote characters from the text
    body = body.replace('"', "")
    body = body.replace("'", "")
    # remove inline javascript and css
    script = re.compile(r'<script.*?</script>')
    style = re.compile(r'<style.*?</style>')
    body = script.sub('', body)
    body = style.sub('', body)
    # keep only the text between tags
    pattern = re.compile(r">(.*?)<")
    text = re.findall(pattern, body)
    text = ''.join(text)
    return text
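
# Illustrative example (not executed): for a minimal document such as
#   '<html><body><p>hello <b>world</b></p></body></html>'
# get_body() returns 'hello world'.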

# fetch a page and store it when it satisfies the keyword filter
def find_data(url, keyword, dbfile, logger):
    content = ''
    # pretend to be a browser to avoid 403 responses
    headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux i686; rv:9.0.1) Gecko/20100101 Firefox/9.0.1'}
    try:
        request = urllib2.Request(url, headers=headers)
        instance = urllib2.urlopen(request, timeout=60)
        content = instance.read()
        body = get_body(content)
    except Exception, e:
        logger.level2("exception:%s in %s" % (str(e), url))
        body = ''
    db = DB(dbfile, logger)
    # with no keyword every page is saved
    if keyword is None:
        db.insert(keyword, url, body)
    # otherwise only pages whose body contains the keyword are stored
    else:
        if keyword in body:
            logger.level5("found " + keyword + " in " + url)
            db.insert(keyword, url, body)
    return content

# usage message (the docstring doubles as the doctest used by --testself)
def usage():
    """
    >>> usage()
    Usage: spider.py -u url [options]
    Options:
      -u, --url       start url of the crawl (required)
      -d, --deep      crawl depth (default 1)
      --keyword       only save pages that contain this keyword (default: save every page)
      --thread        thread pool size (default 10)
      --dbfile        sqlite file for the results (default ./spider.db)
      -f, --logfile   log file path (default ./spider.log)
      -l, --loglevel  log verbosity, higher means more detail (default 3)
      --testself      run the built-in self test
      -h, --help      show this message
    """
    print """Usage: spider.py -u url [options]
Options:
  -u, --url       start url of the crawl (required)
  -d, --deep      crawl depth (default 1)
  --keyword       only save pages that contain this keyword (default: save every page)
  --thread        thread pool size (default 10)
  --dbfile        sqlite file for the results (default ./spider.db)
  -f, --logfile   log file path (default ./spider.log)
  -l, --loglevel  log verbosity, higher means more detail (default 3)
  --testself      run the built-in self test
  -h, --help      show this message"""
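
# A hypothetical invocation (example.com and all option values are placeholders):
#   python spider.py -u http://example.com -d 2 --thread 5 --keyword python --dbfile result.db -f crawl.log -l 5
# crawls example.com two levels deep with five threads, stores every page whose
# body contains "python" into result.db, and logs verbosely to crawl.log.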

# parse the command line arguments
def get_args():
    url = keyword = None
    deep = 1
    dbfile = "./spider.db"
    logfile = "./spider.log"
    log_level = 3
    need_log = False
    thread_number = 10
    opts, args = getopt.getopt(sys.argv[1:],
                               'u:d:f:l:h',
                               ['url=',
                                'deep=',
                                'keyword=',
                                'help',
                                'dbfile=',
                                'logfile=',
                                'loglevel=',
                                'thread=',
                                'testself',
                                ])
    for opt, arg in opts:
        if opt in ("-u", "--url"):
            # make sure the start url carries a scheme
            if not arg.startswith(('http://', 'https://')):
                url = 'http://' + arg
            else:
                url = arg
        elif opt in ("-d", "--deep"):
            deep = int(arg)
        elif opt == "--keyword":
            keyword = arg
        elif opt in ("-h", "--help"):
            usage()
        elif opt == "--dbfile":
            dbfile = arg
        elif opt in ("-f", "--logfile"):
            logfile = arg
            need_log = True
        elif opt in ("-l", "--loglevel"):
            log_level = int(arg)
            need_log = True
        elif opt == '--thread':
            thread_number = int(arg)
        elif opt == '--testself':
            global self_test
            self_test = True
    return (url, keyword, deep, dbfile, logfile, log_level, need_log, thread_number)

# timer callback that reports crawl progress every 10 seconds
def progress():
    if 0 == len(urls):
        print "--- waiting for threads to finish."
    else:
        print '--- did:', len(did_urls) + 1, 'remain:', len(urls), 'deep:', current_deep
    # re-arm the timer so progress keeps being reported
    global t_p
    t_p = threading.Timer(10.0, progress)
    t_p.setDaemon(True)
    t_p.start()

# crawl level by level until the requested depth is reached
def spider(keyword, deep, dbfile, logger, thread_number):
    global urls
    global did_urls
    global next_deep_urls
    global current_deep
    lock = threading.RLock()
    lock.acquire()
    while True:
        # stop once the requested depth has been exceeded
        if current_deep > deep:
            break
        if [] != urls:
            url = urls.pop(0)
            # never crawl the same url twice
            if url not in did_urls:
                logger.level1("+++ %s did:%d remain:%d deep:%d in:%s" % (
                    threading.currentThread().getName(), len(did_urls) + 1,
                    len(urls), current_deep, url))
                # remember the url as crawled
                did_urls.append(url)
                # fetch the page and store it if it matches the keyword
                content = find_data(url, keyword, dbfile, logger)
                # collect every url found on the page
                deep_urls = get_urls(url, content, logger)
                deep_urls = list(set(deep_urls))
                logger.level4("*** " + url + ' have ' + str(len(deep_urls)) + ' urls')
                # queue the links for the next depth level
                next_deep_urls.extend(deep_urls)
                logger.level4("*** extended " + str(len(deep_urls)) + ' urls in next_deep_urls')
            # the current level has been fully crawled
            if [] == urls:
                # deduplicate the next level and make it the current one
                urls = list(set(next_deep_urls))
                logger.level4("-*- next deep have " + str(len(urls)) + ' urls')
                next_deep_urls = []
                # move down one level
                current_deep = current_deep + 1
        # nothing left to crawl at all
        else:
            break
    lock.release()
    logger.level3("-+- " + threading.currentThread().getName() + " is over!")

# log handling: level1 (critical) through level5 (debug) map onto the logging module
class Logger:
    def __init__(self, logfile, log_level, need_log):
        import logging
        # map the spider's numeric levels onto the logging module's constants
        LEVEL = {1: 50,  # CRITICAL
                 2: 40,  # ERROR
                 3: 30,  # WARNING
                 4: 20,  # INFO
                 5: 10,  # DEBUG
                 6: 0,   # NOTSET
                 }
        logging.basicConfig(filename=logfile,
                            level=LEVEL[log_level],
                            filemode='w',
                            format='%(asctime)s *** %(message)s')
        self.logger = logging.getLogger('spider_logger')
        self.need_log = need_log

    def level5(self, msg):
        if msg and self.need_log:
            self.logger.debug(msg)

    def level4(self, msg):
        if msg and self.need_log:
            self.logger.info(msg)

    def level3(self, msg):
        if msg and self.need_log:
            self.logger.warning(msg)

    def level2(self, msg):
        if msg and self.need_log:
            self.logger.error(msg)

    def level1(self, msg):
        if msg and self.need_log:
            self.logger.critical(msg)

# program self test (runs the doctest embedded in usage())
def _test():
    import doctest
    doctest.testmod()

def main():
    url, keyword, deep, dbfile, logfile, log_level, need_log, thread_number = get_args()
    threads = []
    # program self test
    if self_test:
        _test()
    # logging
    logger = Logger(logfile, log_level, need_log)
    if url:
        # start the timer thread that reports progress
        t_p = threading.Timer(10.0, progress)
        t_p.setDaemon(True)
        t_p.start()
        urls.append(url)
        spider(keyword, deep, dbfile, logger, thread_number)
        # thread pool
        for i in range(thread_number):
            thread_name = 'spider-' + str(i + 1)
            spider_thread = threading.Thread(target=spider,
                                             args=(keyword, deep, dbfile, logger, thread_number))
            spider_thread.setName(thread_name)
            spider_thread.start()
            logger.level3("-+- " + thread_name + " started!")
            threads.append(spider_thread)
        for t in threads:
            if t.isAlive():
                t.join()
        # once every thread has finished, show what was stored
        db = DB(dbfile, logger)
        db.show_info(keyword)
    else:
        usage()
        logger.level5("exit: no start url given")
        sys.exit()

if __name__ == "__main__":
    main()