spider demo
#!/usr/bin/env python
# -*- coding:utf-8 -*-
import re
import urllib2
import sqlite3
from BeautifulSoup import BeautifulSoup
import sys
import time
import getopt
import threading

# URLs waiting to be crawled
urls = []
# URLs collected for the next depth level
next_deep_urls = []
# all URLs that have already been crawled
did_urls = []
# current crawl depth
current_deep = 1
# whether to run the self-test
self_test = False
# sqlite3 database helper
class DB(object):
    # open the database and create the result table if needed
    def __init__(self, dbfile, logger):
        self.logger = logger
        self.conn = sqlite3.connect(dbfile)
        self.cur = self.conn.cursor()
        self.cur.execute("""create table if not exists t_info(
                id integer primary key autoincrement,
                keyword text,
                url text not null,
                content text not null
            )""")

    # insert a page whose content matched the keyword
    def insert(self, v_keyword, v_url, v_content):
        try:
            self.cur.execute("""insert into t_info(keyword, url, content) values('{keyword}', '{url}', '{content}')""".format(keyword=v_keyword, url=v_url, content=v_content))
            self.conn.commit()
        except Exception, e:
            self.logger.level2(str(e) + v_url)
        finally:
            # close the connection
            self.cur.close()
            self.conn.close()

    # print the stored rows for a keyword
    def show_info(self, v_keyword):
        self.cur.execute('''select keyword, url, content from t_info where keyword = "{keyword}"'''.format(keyword=v_keyword))
        result = self.cur.fetchall()
        for data in result:
            print "-" * 20
            print "keyword :", data[0]
            print "url :", data[1]
            print "content :", data[2]
            print "-" * 20
# collect all URLs found in a page
def get_urls(url, content, logger):
    urls = []
    try:
        soup = BeautifulSoup(content)
        # gather tags whose href attribute starts with http(s) or /
        hrefs = soup.findAll(href=re.compile('^http(s)|/'))
        # pull the hyperlink out of each href
        for href in hrefs:
            pattern = re.compile(r'href=(\'|\")(.*?)(\'|\")')
            match = pattern.search(str(href))
            if match:
                # group 2 holds the URL value
                u = match.group(2)
                # links starting with / are joined to the parent URL to form a full link
                if u.startswith('/'):
                    # drop the trailing / of the parent URL first
                    u = url.strip('/') + u
                urls.append(u)
    except Exception, e:
        logger.level5(str(e))
    return urls
# extract the text inside the <body> of a page
def get_body(content):
    soup = BeautifulSoup(content)
    # drop newlines so the regexes below can match across lines
    body = str(soup('body')).replace('\n', '')
    # drop quotes to avoid quoting conflicts when inserting into the database
    body = body.replace('"', "")
    body = body.replace("'", "")
    # strip javascript and css from the page
    script = re.compile(r'<script.*?</script>')
    style = re.compile(r'<style.*?</style>')
    body = script.sub('', body)
    body = style.sub('', body)
    # keep only the text between tags
    pattern = re.compile(r">(.*?)<")
    text = re.findall(pattern, body)
    text = ''.join(text)
    return text
# fetch a page and store it when it matches
def find_data(url, keyword, dbfile, logger):
    content = ''
    # pretend to be a browser to avoid 403 responses
    headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux i686; rv:9.0.1) Gecko/20100101 Firefox/9.0.1'}
    try:
        request = urllib2.Request(url, headers=headers)
        instance = urllib2.urlopen(request, timeout=60)
        content = instance.read()
        body = get_body(content)
    except Exception, e:
        logger.level2("exception:%s in %s" % (str(e), url))
        body = ''
    db = DB(dbfile, logger)
    # without a keyword, every page is saved
    if keyword == None:
        db.insert(keyword, url, body)
    # with a keyword, only pages containing it are saved
    else:
        if keyword in body:
            logger.level5("found " + keyword + " in " + url)
            db.insert(keyword, url, body)
    return content
# usage / help text
def usage():
    """
    >>> usage()
    Usage: spider.py -u url [options]
    Options:
      -u          start URL of the crawl
      -d          crawl depth
      --thread    thread pool size for concurrent crawling; optional, default 10
      --dbfile    sqlite database file where results are stored
      --keyword   only pages containing this keyword are saved; optional, default saves every page
      -l          log verbosity, larger numbers log more detail; optional, default log file spider.log
      --testself  run the built-in self-test; optional
    """
    print """Usage: spider.py -u url [options]
Options:
  -u          start URL of the crawl
  -d          crawl depth
  --thread    thread pool size for concurrent crawling; optional, default 10
  --dbfile    sqlite database file where results are stored
  --keyword   only pages containing this keyword are saved; optional, default saves every page
  -l          log verbosity, larger numbers log more detail; optional, default log file spider.log
  --testself  run the built-in self-test; optional"""
# parse command line arguments
def get_args():
    url = keyword = None
    deep = 1
    dbfile = "./spider.db"
    logfile = "./spider.log"
    log_level = 3
    need_log = False
    thread_number = 10
    opts, args = getopt.getopt(sys.argv[1:],
                               'u:d:f:l:h',
                               ['url=',
                                'deep=',
                                'keyword=',
                                'help',
                                'dbfile=',
                                'logfile=',
                                'loglevel=',
                                'thread=',
                                'testself',
                                ])
    for opt, arg in opts:
        if opt in ("-u", "--url"):
            if not arg.startswith('http://'):
                url = 'http://' + arg
            else:
                url = arg
        elif opt in ("-d", "--deep"):
            deep = int(arg)
        elif opt in ("--keyword",):
            keyword = arg
        elif opt in ("-h", "--help"):
            usage()
        elif opt in ("--dbfile",):
            dbfile = arg
        elif opt in ("-f", "--logfile"):
            logfile = arg
            need_log = True
        elif opt in ("-l", "--loglevel"):
            log_level = int(arg)
            need_log = True
        elif opt in ('--thread',):
            thread_number = int(arg)
        elif opt in ('--testself',):
            global self_test
            self_test = True
    return (url, keyword, deep, dbfile, logfile, log_level, need_log, thread_number)
# timer callback that periodically prints crawl progress
def progress():
    if 0 == len(urls):
        print "--- waiting threads end."
    else:
        print '--- did:', len(did_urls) + 1, 'remain:', len(urls), 'deep:', current_deep
    # reschedule itself every 10 seconds
    global t_p
    t_p = threading.Timer(10.0, progress)
    t_p.setDaemon(True)
    t_p.start()
# crawl level by level up to the requested depth
def spider(keyword, deep, dbfile, logger, thread_number):
    global urls
    global did_urls
    global next_deep_urls
    global current_deep
    lock = threading.RLock()
    lock.acquire()
    while True:
        # stop once the requested depth is exceeded
        if current_deep > deep:
            break
        if [] != urls:
            url = urls.pop(0)
            # skip URLs that have already been crawled
            if url not in did_urls:
                logger.level1("+++ " + threading.currentThread().getName() + ' did:' + str(len(did_urls) + 1) + ' remain:' + str(len(urls)) + ' deep:' + str(current_deep) + ' in:' + str(url))
                # remember the crawled URL
                did_urls.append(url)
                # fetch the content of the current URL
                content = find_data(url, keyword, dbfile, logger)
                # collect every URL in the current page
                deep_urls = get_urls(url, content, logger)
                deep_urls = list(set(deep_urls))
                logger.level4("*** " + url + ' have ' + str(len(deep_urls)) + ' urls')
                # queue them for the next depth level
                next_deep_urls.extend(deep_urls)
                logger.level4("*** extended " + str(len(deep_urls)) + ' urls in next_deep_urls')
            # the current level has been crawled completely
            if [] == urls:
                # deduplicate the next level's URLs and promote them
                urls = list(set(next_deep_urls))
                logger.level4("-*- next deep have " + str(len(urls)) + ' urls')
                # clear the next level list
                next_deep_urls = []
                # move one level deeper
                current_deep = current_deep + 1
                ## thread pool; written like this so the startup delay does not make the other threads exit immediately
                #if current_deep == 2 and deep > 1:
                #    for i in range(thread_number):
                #        thread_name = 'spider-' + str(i + 1)
                #        spider_thread = threading.Thread(target=spider, args=(keyword, deep, dbfile, logger, thread_number))
                #        spider_thread.setName(thread_name)
                #        spider_thread.start()
                #        logger.level3("-+- " + thread_name + " started!")
                #    time.sleep(10)
        # every URL has been crawled
        else:
            break
    lock.release()
    logger.level3("-+- " + threading.currentThread().getName() + " is over!")
# log wrapper mapping level1 (most severe) .. level5 (most verbose) onto the logging module
class Logger:
    def __init__(self, logfile, log_level, need_log):
        import logging
        LEVEL = {1: 50,  # CRITICAL
                 2: 40,  # ERROR
                 3: 30,  # WARNING
                 4: 20,  # INFO
                 5: 10,  # DEBUG
                 6: 0,   # NOTSET
                 }
        logging.basicConfig(filename=logfile,
                            level=LEVEL[log_level],
                            filemode='w',
                            format='%(asctime)s *** %(message)s')
        self.logger = logging.getLogger('spider_logger')
        self.need_log = need_log

    def level5(self, msg):
        if msg and self.need_log:
            self.logger.debug(msg)

    def level4(self, msg):
        if msg and self.need_log:
            self.logger.info(msg)

    def level3(self, msg):
        if msg and self.need_log:
            self.logger.warning(msg)

    def level2(self, msg):
        if msg and self.need_log:
            self.logger.error(msg)

    def level1(self, msg):
        if msg and self.need_log:
            self.logger.critical(msg)
# run the doctest-based self-test
def _test():
    import doctest
    doctest.testmod()


def main():
    url, keyword, deep, dbfile, logfile, log_level, need_log, thread_number = get_args()
    threads = []
    # program self-test
    if self_test:
        _test()
    # logging
    logger = Logger(logfile, log_level, need_log)
    if url:
        # start the progress timer thread
        t_p = threading.Timer(10.0, progress)
        t_p.setDaemon(True)
        t_p.start()
        urls.append(url)
        spider(keyword, deep, dbfile, logger, thread_number)
        # thread pool
        for i in range(thread_number):
            thread_name = 'spider-' + str(i + 1)
            spider_thread = threading.Thread(target=spider, args=(keyword, deep, dbfile, logger, thread_number))
            spider_thread.setName(thread_name)
            spider_thread.start()
            logger.level3("-+- " + thread_name + " started!")
            threads.append(spider_thread)
        for t in threads:
            if t.isAlive():
                t.join()
        # show the stored results once every spider thread has finished
        # (dbfile, logger and keyword are only in scope here, so the report runs inside main)
        db = DB(dbfile, logger)
        db.show_info(keyword)
    else:
        usage()
        logger.level5("exit: no url was given")
        sys.exit()


if __name__ == "__main__":
    main()
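A minimal example of how the script might be run, assuming Python 2 with the old BeautifulSoup package installed; the start URL and the keyword below are placeholders, and the flags follow what get_args() actually parses:

# crawl two levels deep from the start URL with five threads, saving pages that contain "python"
python spider.py -u www.example.com -d 2 --thread 5 --keyword python --dbfile ./spider.db -f ./spider.log -l 4

With -l 4 the Logger maps the level to logging.INFO, so level4 and more severe messages land in ./spider.log; a bare -u without http:// gets the scheme prepended automatically.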