Created
March 21, 2017 04:45
-
-
Save forthxu/7703ee5b3f17a3e031f8fd50fc5ba857 to your computer and use it in GitHub Desktop.
Python全站内容抓取、URL扫描 实例:用于扫描自己的网站的违禁词连接和对应的违禁词
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
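#!/usr/bin/python
# -*- coding:utf-8 -*-
# Original Author: FrankHacker
# Modified by Linson @691000737
# Requires the requests module, Python 3.x+
# Scans a whole site for common banned words. The word list can be exported from your own site in JSON format; replace the content of line 13 (the JSON literal passed to json.loads below) with it.
# The program creates a badword.txt file in the current directory.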
import requests | |
import re,json | |
# Initial URL the scan starts from.
url = "http://www.baidu.com"
#-----------违禁词处理 START----------- | |
d=json.loads('[{"bid":"1","replacefrom":"\u7eaf\u5929\u7136","replaceto":"","deny":"0"}, {"bid":"2","replacefrom":"\u6539\u5584\u7761\u7720","replaceto":"","deny":"0"}, {"bid":"3","replacefrom":"\u56fd\u5bb6\u7ea7","replaceto":"","deny":"0"}, {"bid":"4","replacefrom":"\u4e16\u754c\u7ea7","replaceto":"","deny":"0"}, {"bid":"5","replacefrom":"\u6700\u9ad8\u7ea7","replaceto":"","deny":"0"}, {"bid":"6","replacefrom":"\u6700\u4f73","replaceto":"","deny":"0"}, {"bid":"7","replacefrom":"\u7b2c\u4e00","replaceto":"","deny":"0"}, {"bid":"9","replacefrom":"\u9996\u4e2a","replaceto":"","deny":"0"}, {"bid":"10","replacefrom":"\u6700\u597d","replaceto":"","deny":"0"}, {"bid":"11","replacefrom":"\u7cbe\u786e","replaceto":"","deny":"0"}, {"bid":"12","replacefrom":"\u9876\u7ea7","replaceto":"","deny":"0"}, {"bid":"13","replacefrom":"\u6700\u4f4e","replaceto":"","deny":"0"}, {"bid":"14","replacefrom":"\u6700\u5e95","replaceto":"","deny":"0"}, {"bid":"15","replacefrom":"\u6700","replaceto":"","deny":"0"}, {"bid":"16","replacefrom":"\u6700\u4fbf\u5b9c","replaceto":"","deny":"0"}, {"bid":"17","replacefrom":"\u6700\u5927\u7a0b\u5ea6","replaceto":"","deny":"0"}, {"bid":"18","replacefrom":"\u6700\u65b0\u6280\u672f","replaceto":"","deny":"0"}, {"bid":"19","replacefrom":"\u6700\u5148\u8fdb\u79d1\u5b66","replaceto":"","deny":"0"}, {"bid":"20","replacefrom":"\u56fd\u5bb6\u7ea7\u4ea7\u54c1","replaceto":"","deny":"0"}, {"bid":"21","replacefrom":"\u586b\u8865\u56fd\u5185\u7a7a\u767d","replaceto":"","deny":"0"}, {"bid":"22","replacefrom":"\u7edd\u5bf9","replaceto":"","deny":"0"}, {"bid":"23","replacefrom":"\u72ec\u5bb6","replaceto":"","deny":"0"}, {"bid":"24","replacefrom":"\u9996\u5bb6","replaceto":"","deny":"0"}, {"bid":"25","replacefrom":"\u6700\u65b0","replaceto":"","deny":"0"}, {"bid":"26","replacefrom":"\u6700\u5148\u8fdb","replaceto":"","deny":"0"}, {"bid":"27","replacefrom":"\u7b2c\u4e00\u54c1\u724c","replaceto":"","deny":"0"}, 
{"bid":"28","replacefrom":"\u91d1\u724c","replaceto":"","deny":"0"}, {"bid":"29","replacefrom":"\u540d\u724c","replaceto":"","deny":"0"}, {"bid":"30","replacefrom":"\u6700\u8d5a","replaceto":"","deny":"0"}, {"bid":"31","replacefrom":"\u8d85\u8d5a","replaceto":"","deny":"0"}, {"bid":"32","replacefrom":"\u6700\u5148","replaceto":"","deny":"0"}, {"bid":"33","replacefrom":"\u5de8\u661f","replaceto":"","deny":"0"}, {"bid":"34","replacefrom":"\u5962\u4f88","replaceto":"","deny":"0"}, {"bid":"35","replacefrom":"\u81f3\u5c0a","replaceto":"","deny":"0"}, {"bid":"36","replacefrom":"\u9876\u7ea7\u4eab\u53d7","replaceto":"","deny":"0"}, {"bid":"37","replacefrom":"\u56fd\u5bb6","replaceto":"","deny":"0"}, {"bid":"38","replacefrom":"\u8d28\u91cf\u514d\u68c0","replaceto":"","deny":"0"}, {"bid":"39","replacefrom":"\u65e0\u9700\u56fd\u5bb6\u8d28\u91cf\u68c0\u6d4b","replaceto":"","deny":"0"}, {"bid":"40","replacefrom":"\u514d\u62bd\u68c0","replaceto":"","deny":"0"}, {"bid":"41","replacefrom":"\u6700\u9ad8","replaceto":"","deny":"0"}, {"bid":"42","replacefrom":"\u5bb6\u5ead\u5fc5\u5907","replaceto":"","deny":"0"}, {"bid":"43","replacefrom":"\u795e\u4e39","replaceto":"","deny":"0"}, {"bid":"44","replacefrom":"\u6291\u5236","replaceto":"","deny":"0"}, {"bid":"45","replacefrom":"\u5947\u6548","replaceto":"","deny":"0"}, {"bid":"46","replacefrom":"\u795e\u6548","replaceto":"","deny":"0"}, {"bid":"47","replacefrom":"\u6700\u5148","replaceto":"","deny":"0"}, {"bid":"48","replacefrom":"\u56fd\u5bb6\u673a\u5173","replaceto":"","deny":"0"}, {"bid":"49","replacefrom":"\u9ad8\u6548","replaceto":"","deny":"0"}, {"bid":"50","replacefrom":"\u8d85\u8d5a","replaceto":"","deny":"0"}, {"bid":"51","replacefrom":"\u7956\u4f20","replaceto":"","deny":"0"}, {"bid":"52","replacefrom":"\u6700\u4f73","replaceto":"","deny":"0"}, {"bid":"53","replacefrom":"\u6700\u8584","replaceto":"","deny":"0"}, {"bid":"54","replacefrom":"\u5408\u9a70\u540d\u5546\u6807","replaceto":"","deny":"0"}, 
{"bid":"55","replacefrom":"\u795b\u7600","replaceto":"","deny":"0"}, {"bid":"56","replacefrom":"\u901f\u6548","replaceto":"","deny":"0"}, {"bid":"57","replacefrom":"\u56fd\u5bb6\u514d\u68c0","replaceto":"","deny":"0"}, {"bid":"58","replacefrom":"\u795e\u4ed9","replaceto":"","deny":"0"}, {"bid":"59","replacefrom":"\u4e2d\u56fd\u9a70\u540d","replaceto":"","deny":"0"}, {"bid":"60","replacefrom":"\u6700\u9002","replaceto":"","deny":"0"}, {"bid":"61","replacefrom":"\u8d85\u5f3a","replaceto":"","deny":"0"}, {"bid":"62","replacefrom":"\u6700\u7f8e","replaceto":"","deny":"0"}, {"bid":"63","replacefrom":"\u7279\u6548","replaceto":"","deny":"0"}, {"bid":"64","replacefrom":"\u5962\u4f88","replaceto":"","deny":"0"}, {"bid":"65","replacefrom":"\u6700\u5c0f","replaceto":"","deny":"0"}, {"bid":"66","replacefrom":"\u79d8\u5236","replaceto":"","deny":"0"}, {"bid":"67","replacefrom":"\u6700\u65b0\u6280\u672f","replaceto":"","deny":"0"}, {"bid":"68","replacefrom":"\u6700\u4fbf\u5b9c","replaceto":"","deny":"0"}, {"bid":"69","replacefrom":"100%","replaceto":"","deny":"0"}, {"bid":"70","replacefrom":"\u6700\u6d41\u884c","replaceto":"","deny":"0"}, {"bid":"71","replacefrom":"\u7ec8\u6781","replaceto":"","deny":"0"}, {"bid":"72","replacefrom":"\u51a0\u519b","replaceto":"","deny":"0"}, {"bid":"73","replacefrom":"\u524d\u65e0\u53e4\u4eba","replaceto":"","deny":"0"}, {"bid":"74","replacefrom":"\u6781\u81f4","replaceto":"","deny":"0"}, {"bid":"75","replacefrom":"\u65f6\u5c1a\u6700\u4f4e\u4ef7","replaceto":"","deny":"0"}, {"bid":"76","replacefrom":"\u6700\u5927","replaceto":"","deny":"0"}, {"bid":"77","replacefrom":"\u5168\u56fd\u9500\u91cf\u51a0\u519b","replaceto":"","deny":"0"}, {"bid":"78","replacefrom":"\uff08\u9065\u9065\uff09\u9886\u5148","replaceto":"","deny":"0"}, {"bid":"79","replacefrom":"\u8d28\u91cf\u514d\u68c0","replaceto":"","deny":"0"}, {"bid":"80","replacefrom":"\u4e2d\u56fd\u9a70\u540d\u5546\u6807","replaceto":"","deny":"0"}, 
{"bid":"81","replacefrom":"\u9996\u4e2a","replaceto":"","deny":"0"}, {"bid":"82","replacefrom":"\u8457\u540d","replaceto":"","deny":"0"}, {"bid":"83","replacefrom":"\u9886\u5bfc\u8005","replaceto":"","deny":"0"}, {"bid":"84","replacefrom":"\u9886\u8896\u54c1\u724c","replaceto":"","deny":"0"}, {"bid":"85","replacefrom":"\u4e13\u4f9b","replaceto":"","deny":"0"}, {"bid":"86","replacefrom":"\u4ec5\u6b64\u4e00\u6b21","replaceto":"","deny":"0"}, {"bid":"87","replacefrom":"\u6700\u65f6\u5c1a","replaceto":"","deny":"0"}, {"bid":"88","replacefrom":"\u6781\u54c1","replaceto":"","deny":"0"}, {"bid":"89","replacefrom":"\u6700\u65b0\u79d1\u6280","replaceto":"","deny":"0"}, {"bid":"90","replacefrom":"\u56fd\u9645\u54c1\u8d28","replaceto":"","deny":"0"}, {"bid":"91","replacefrom":"\u8d44\u6df1","replaceto":"","deny":"0"}, {"bid":"92","replacefrom":"\u7cbe\u51c6","replaceto":"","deny":"0"}, {"bid":"93","replacefrom":"\u8d2d\u7269\u5927\u8db4","replaceto":"","deny":"0"}, {"bid":"94","replacefrom":"\u6700\u65b0\u79d1\u5b66","replaceto":"","deny":"0"}, {"bid":"95","replacefrom":"\u6700\u597d","replaceto":"","deny":"0"}, {"bid":"96","replacefrom":"\u5168\u7f51\u7b2c\u4e00","replaceto":"","deny":"0"}, {"bid":"97","replacefrom":"\u6700\u4f4e\u4ef7","replaceto":"","deny":"0"}, {"bid":"98","replacefrom":"\u4e2d\u56fd\u7b2c\u4e00","replaceto":"","deny":"0"}, {"bid":"99","replacefrom":"\u5168\u7403\u7ea7","replaceto":"","deny":"0"}, {"bid":"100","replacefrom":"\u6781\u4f73\uff08\u7edd\u4f73\/\u7edd\u5bf9\uff09","replaceto":"","deny":"0"}, {"bid":"101","replacefrom":"\u6ca1\u6709\u4ed6\u5c31","replaceto":"","deny":"0"}, {"bid":"102","replacefrom":"\u4e07\u4eba\u75af\u62a2","replaceto":"","deny":"0"}, {"bid":"103","replacefrom":"\u6700\u65b0","replaceto":"","deny":"0"}, {"bid":"104","replacefrom":"\u586b\u8865\u56fd\u5185\u7a7a\u767d","replaceto":"","deny":"0"}, {"bid":"105","replacefrom":"\u8d81\u73b0\u5728","replaceto":"","deny":"0"}, 
{"bid":"106","replacefrom":"\u6700\u540e\u4e00\u6ce2","replaceto":"","deny":"0"}, {"bid":"107","replacefrom":"\u6700\u5148\u8fdb\u52a0\u5de5\u5de5\u827a","replaceto":"","deny":"0"}, {"bid":"108","replacefrom":"NO.1","replaceto":"","deny":"0"}, {"bid":"109","replacefrom":"\u5de8\u661f","replaceto":"","deny":"0"}, {"bid":"110","replacefrom":"\u6700\u7b26\u5408","replaceto":"","deny":"0"}, {"bid":"111","replacefrom":"\u7279\u6548","replaceto":"","deny":"0"}, {"bid":"112","replacefrom":"\u6700\u9ad8","replaceto":"","deny":"0"}, {"bid":"113","replacefrom":"\u53f2\u65e0\u524d\u4f8b","replaceto":"","deny":"0"}, {"bid":"114","replacefrom":"\u7eaf\u5929\u7136","replaceto":"","deny":"0"}, {"bid":"115","replacefrom":"\u9876\u7ea7\u4eab\u53d7","replaceto":"","deny":"0"}, {"bid":"116","replacefrom":"\u7279\u4f9b","replaceto":"","deny":"0"}, {"bid":"117","replacefrom":"\u91d1\u724c","replaceto":"","deny":"0"}, {"bid":"118","replacefrom":"\u5168\u56fd","replaceto":"","deny":"0"}, {"bid":"119","replacefrom":"\u70b9\u51fb\u9886\u5956","replaceto":"","deny":"0"}, {"bid":"120","replacefrom":"\u6700\u4f4e\u7ea7","replaceto":"","deny":"0"}, {"bid":"121","replacefrom":"\u6700\u5962\u4f88","replaceto":"","deny":"0"}, {"bid":"122","replacefrom":"\u62a2\u75af\u4e86","replaceto":"","deny":"0"}, {"bid":"123","replacefrom":"\u70b9\u51fb\u6709\u60ca\u559c","replaceto":"","deny":"0"}, {"bid":"124","replacefrom":"\u72ec\u5bb6","replaceto":"","deny":"0"}, {"bid":"125","replacefrom":"\u6700\u4f18\u79c0","replaceto":"","deny":"0"}, {"bid":"126","replacefrom":"\u7b2c\u4e00","replaceto":"","deny":"0"}, {"bid":"127","replacefrom":"\u4e16\u754c\u7ea7","replaceto":"","deny":"0"}, {"bid":"128","replacefrom":"\u5dc5\u5cf0","replaceto":"","deny":"0"}, {"bid":"129","replacefrom":"\u4e07\u80fd","replaceto":"","deny":"0"}, {"bid":"130","replacefrom":"\u6b63\u54c1","replaceto":"","deny":"0"}, {"bid":"131","replacefrom":"\u70b9\u51fb\u8bd5\u7a7f","replaceto":"","deny":"0"}, 
{"bid":"132","replacefrom":"\u8d85\u8d5a","replaceto":"","deny":"0"}, {"bid":"133","replacefrom":"\u65e0\u9700\u56fd\u5bb6\u8d28\u91cf\u68c0\u6d4b","replaceto":"","deny":"0"}, {"bid":"134","replacefrom":"\u9996\u9009","replaceto":"","deny":"0"}, {"bid":"135","replacefrom":"\u62a2\u7206","replaceto":"","deny":"0"}, {"bid":"136","replacefrom":"\u5927\u724c","replaceto":"","deny":"0"}, {"bid":"137","replacefrom":"\u606d\u559c\u83b7\u5956","replaceto":"","deny":"0"}, {"bid":"138","replacefrom":"\u9519\u8fc7\u5c31\u6ca1\u673a\u4f1a\u4e86","replaceto":"","deny":"0"}, {"bid":"139","replacefrom":"\u5168\u6c11\u75af\u62a2","replaceto":"","deny":"0"}, {"bid":"140","replacefrom":"\u6700\u8d5a","replaceto":"","deny":"0"}, {"bid":"141","replacefrom":"\u771f\u76ae","replaceto":"","deny":"0"}, {"bid":"142","replacefrom":"\u6700\u9ad8\u6863","replaceto":"","deny":"0"}, {"bid":"143","replacefrom":"\u9886\u53d6\u5956\u54c1","replaceto":"","deny":"0"}, {"bid":"144","replacefrom":"\u54c1\u724c\u56e2","replaceto":"","deny":"0"}, {"bid":"145","replacefrom":"\u81f3\u5c0a","replaceto":"","deny":"0"}, {"bid":"146","replacefrom":"\u5962\u4f88","replaceto":"","deny":"0"}, {"bid":"147","replacefrom":"\u6700","replaceto":"","deny":"0"}, {"bid":"148","replacefrom":"\u5168\u6c11\u514d\u5355","replaceto":"","deny":"0"}, {"bid":"149","replacefrom":"\u5168\u7f51\u9996\u53d1","replaceto":"","deny":"0"}, {"bid":"150","replacefrom":"\u5168\u6c11\u62a2\u8d2d","replaceto":"","deny":"0"}, {"bid":"151","replacefrom":"\u4e16\u754c\u9886\u5148","replaceto":"","deny":"0"}, {"bid":"152","replacefrom":"\u518d\u4e0d\u62a2\u5c31\u6ca1\u4e86","replaceto":"","deny":"0"}, {"bid":"153","replacefrom":"\u56fd\u5bb6\u7ea7\u4ea7\u54c1","replaceto":"","deny":"0"}, {"bid":"154","replacefrom":"\u9876\u7ea7\uff08\u9876\u5c16\/\u5c16\u7aef\uff09","replaceto":"","deny":"0"}, {"bid":"155","replacefrom":"\u9886\u8896","replaceto":"","deny":"0"}, {"bid":"156","replacefrom":"\u9876\u7ea7\u5de5\u827a","replaceto":"","deny":"0"}, 
{"bid":"157","replacefrom":"\u6700\u7231","replaceto":"","deny":"0"}, {"bid":"158","replacefrom":"\u4e13\u5bb6\u63a8\u8350","replaceto":"","deny":"0"}, {"bid":"159","replacefrom":"\u56fd\u5bb6\u9886\u5bfc\u4eba","replaceto":"","deny":"0"}, {"bid":"160","replacefrom":"\u6700\u53d7\u6b22\u8fce","replaceto":"","deny":"0"}, {"bid":"161","replacefrom":"\u5468\u5e74\u5e86","replaceto":"","deny":"0"}, {"bid":"162","replacefrom":"\u6700\u5927\u7a0b\u5ea6","replaceto":"","deny":"0"}, {"bid":"163","replacefrom":"\u4e2d\u56fd\u9a70\u540d\uff08\u9a70\u540d\u5546\u6807\uff09","replaceto":"","deny":"0"}, {"bid":"164","replacefrom":"\u9996\u6b21","replaceto":"","deny":"0"}, {"bid":"165","replacefrom":"\u968f\u65f6\u6da8\u4ef7","replaceto":"","deny":"0"}, {"bid":"166","replacefrom":"\u6700\u5148","replaceto":"","deny":"0"}, {"bid":"167","replacefrom":"\u4e00\u6d41","replaceto":"","deny":"0"}, {"bid":"168","replacefrom":"\u5168\u56fd\u9996\u53d1","replaceto":"","deny":"0"}, {"bid":"169","replacefrom":"\u9500\u91cf\u7b2c\u4e00","replaceto":"","deny":"0"}, {"bid":"170","replacefrom":"\u56fd\u5bb6\u7ea7","replaceto":"","deny":"0"}, {"bid":"171","replacefrom":"\u65e0\u654c","replaceto":"","deny":"0"}, {"bid":"172","replacefrom":"\u4e00\u5929","replaceto":"","deny":"0"}, {"bid":"173","replacefrom":"\u4eca\u5929","replaceto":"","deny":"0"}, {"bid":"174","replacefrom":"\u4f18\u79c0","replaceto":"","deny":"0"}, {"bid":"175","replacefrom":"\u6700\u8212\u9002","replaceto":"","deny":"0"}, {"bid":"176","replacefrom":"\u7b2c\u4e00\u54c1\u724c","replaceto":"","deny":"0"}, {"bid":"177","replacefrom":"\u7cbe\u54c1\u56e2","replaceto":"","deny":"0"}, {"bid":"178","replacefrom":"\u5c31","replaceto":"","deny":"0"}, {"bid":"179","replacefrom":"\u6700\u4fbf\u5b9c","replaceto":"","deny":"0"}, {"bid":"180","replacefrom":"\u9996\u6b3e","replaceto":"","deny":"0"}, {"bid":"181","replacefrom":"\u9996\u53d1","replaceto":"","deny":"0"}, {"bid":"182","replacefrom":"\u4ec5\u9650","replaceto":"","deny":"0"}, 
{"bid":"183","replacefrom":"\u5012\u8ba1\u65f6","replaceto":"","deny":"0"}, {"bid":"184","replacefrom":"\u6700\u5e95","replaceto":"","deny":"0"}, {"bid":"185","replacefrom":"\u7956\u4f20","replaceto":"","deny":"0"}, {"bid":"186","replacefrom":"\u514d\u62bd\u68c0","replaceto":"","deny":"0"}, {"bid":"187","replacefrom":"\u6700\u4f4e","replaceto":"","deny":"0"}, {"bid":"188","replacefrom":"\u738b\u724c","replaceto":"","deny":"0"}, {"bid":"189","replacefrom":"\u6700\u5148\u8fdb\u79d1\u5b66","replaceto":"","deny":"0"}, {"bid":"190","replacefrom":"\u5468\u672b","replaceto":"","deny":"0"}, {"bid":"191","replacefrom":"\u6700\u5148\u8fdb","replaceto":"","deny":"0"}, {"bid":"192","replacefrom":"\u72ec\u4e00\u65e0\u4e8c","replaceto":"","deny":"0"}, {"bid":"193","replacefrom":"\u6700\u5177","replaceto":"","deny":"0"}, {"bid":"194","replacefrom":"\u6700\u5148\u4eab\u53d7","replaceto":"","deny":"0"}, {"bid":"195","replacefrom":"\u6700\u4f73","replaceto":"","deny":"0"}, {"bid":"196","replacefrom":"\u8001\u5b57\u53f7","replaceto":"","deny":"0"}, {"bid":"197","replacefrom":"\u4e4b\u738b","replaceto":"","deny":"0"}, {"bid":"198","replacefrom":"\u5b87\u5b99\u7ea7","replaceto":"","deny":"0"}, {"bid":"199","replacefrom":"\u95ea\u8d2d","replaceto":"","deny":"0"}, {"bid":"200","replacefrom":"\u5355\u54c1\u56e2","replaceto":"","deny":"0"}, {"bid":"201","replacefrom":"\u70b9\u51fb\u83b7\u53d6","replaceto":"","deny":"0"}, {"bid":"202","replacefrom":"\u738b\u8005","replaceto":"","deny":"0"}, {"bid":"203","replacefrom":"\u70b9\u51fb\u8f6c\u8eab","replaceto":"","deny":"0"}, {"bid":"204","replacefrom":"\u79d2\u6740","replaceto":"","deny":"0"}, {"bid":"205","replacefrom":"\u638c\u95e8\u4eba","replaceto":"","deny":"0"}, {"bid":"206","replacefrom":"\u7f14\u9020\u8005","replaceto":"","deny":"0"}, {"bid":"207","replacefrom":"\u9886\u5148\u4e0a\u5e02","replaceto":"","deny":"0"}, {"bid":"208","replacefrom":"\u9ad8\u7ea7","replaceto":"","deny":"0"}, 
{"bid":"209","replacefrom":"TOP.1","replaceto":"","deny":"0"}, {"bid":"210","replacefrom":"\u9a6c\u4e0a\u964d\u4ef7\u4eca\u65e5","replaceto":"","deny":"0"}, {"bid":"211","replacefrom":"\u72ec\u5bb6\u914d\u65b9","replaceto":"","deny":"0"}, {"bid":"212","replacefrom":"\u6700\u540e","replaceto":"","deny":"0"}, {"bid":"213","replacefrom":"\u6700\u4f18","replaceto":"","deny":"0"}, {"bid":"214","replacefrom":"\u7279\u60e0\u8db4","replaceto":"","deny":"0"}, {"bid":"215","replacefrom":"\u9ad8\u6863","replaceto":"","deny":"0"}, {"bid":"216","replacefrom":"\u540d\u724c","replaceto":"","deny":"0"}, {"bid":"217","replacefrom":"\u6c38\u4e45","replaceto":"","deny":"0"}, {"bid":"218","replacefrom":"\u968f\u65f6\u7ed3\u675f","replaceto":"","deny":"0"}, {"bid":"219","replacefrom":"\u56fd\u5bb6(\u56fd\u5bb6\u514d\u68c0\uff09","replaceto":"","deny":"0"}, {"bid":"220","replacefrom":"\u70b9\u51fb\u7ffb\u8f6c","replaceto":"","deny":"0"}, {"bid":"221","replacefrom":"\u521b\u9886\u54c1\u724c","replaceto":"","deny":"0"}, {"bid":"222","replacefrom":"\u6700\u9ad8\u7ea7","replaceto":"","deny":"0"}, {"bid":"223","replacefrom":"\u51e0\u5929\u51e0\u591c","replaceto":"","deny":"0"}, {"bid":"224","replacefrom":"\u5356\u75af\u4e86","replaceto":"","deny":"0"}, {"bid":"225","replacefrom":"\u6392\u540d\u7b2c\u4e00","replaceto":"","deny":"0"}, {"bid":"226","replacefrom":"\u5168\u56fd\u7b2c\u4e00","replaceto":"","deny":"0"}, {"bid":"227","replacefrom":"\u72ec\u5bb6","replaceto":"","deny":"0"}, {"bid":"228","replacefrom":"\u6700\u805a\u62e2","replaceto":"","deny":"0"}]') | |
# Flat list of banned words taken from the "replacefrom" field of every entry.
# The exported JSON contains the same word under several ids, so duplicates are
# dropped (first occurrence wins, order preserved) to avoid reporting one hit
# several times.
data = list(dict.fromkeys(entry['replacefrom'] for entry in d))
def handler(url, text):
    """Record every banned word that occurs in the text of one page.

    For each word of the module-level ``data`` list that is a substring of
    *text*, one line ``<url>:<word>`` is appended to ``./badword.txt``.

    :param url: address of the page that was fetched
    :param text: the fetched page source to search
    :return: None
    """
    # dict.fromkeys() drops repeated words while keeping their order, so a
    # word listed twice in ``data`` is reported only once per page.
    hits = [word for word in dict.fromkeys(data) if word in text]
    # ``with`` closes the file even if a write fails; UTF-8 keeps the Chinese
    # words writable regardless of the platform's default encoding.
    with open('./badword.txt', 'a+', encoding='utf-8') as bad_f:
        for word in hits:
            bad_f.write(url + ":" + word + "\n")
#----------违禁词处理 END------------ | |
def url_protocol(url):
    """Return the scheme (``http``, ``https``, ...) of *url*.

    Only the scheme at the very start of the string is considered, so a
    ``://`` that appears later (for example inside a query string) does not
    affect the result.

    :param url: absolute URL such as ``http://www.example.com/page``
    :return: the scheme, without the ``://`` separator
    :raises ValueError: if *url* does not start with ``<scheme>://``
    """
    found = re.match(r'[A-Za-z][A-Za-z0-9+.\-]*(?=://)', url)
    if found is None:
        raise ValueError("URL has no '<scheme>://' prefix: %r" % (url,))
    return found.group(0)
# Scheme of the start URL.
urlprotocol = url_protocol(url)
def same_url(url):
    """Reduce *url* to the site domain used for same-site checks.

    The leading ``<scheme>://`` is removed, then a leading ``www.`` label (if
    any), and everything from the first ``/`` onwards is discarded. A port,
    when present, is kept as part of the result.

    :param url: URL entered by the user, e.g. ``http://www.example.com/a``
    :return: the domain, e.g. ``example.com``
    """
    # Strip only the scheme at the start of the string.
    host = re.sub(r'^[A-Za-z][A-Za-z0-9+.\-]*://', '', url)
    # Require the literal "www." label so that hosts which merely begin with
    # the letters "www" (e.g. "wwwfoo.com") are left intact.
    if host.startswith('www.'):
        host = host[len('www.'):]
    # The domain ends at the first path separator.
    return host.split('/', 1)[0]
# Domain of the start URL; the crawler uses it to stay on the same site.
domain_url = same_url(url)
''' | |
处理url的类,对已访问过的和未访问过的进行记录,待后续使用 | |
''' | |
class linkQuence:
    """Bookkeeping of crawled and pending URLs.

    ``visited`` holds URLs that were already processed; ``unvisited`` holds
    URLs still waiting. New URLs are inserted at the front of ``unvisited``
    and taken from the back, so pending URLs are served first-in, first-out.
    """

    def __init__(self):
        self.visited = []    # URLs already processed
        self.unvisited = []  # URLs still waiting to be processed

    def getVisitedUrl(self):
        """Return the list of visited URLs."""
        return self.visited

    def getUnvisitedUrl(self):
        """Return the list of pending URLs."""
        return self.unvisited

    def addVisitedUrl(self, url):
        """Mark *url* as visited."""
        return self.visited.append(url)

    def addUnvisitedUrl(self, url):
        """Queue *url* unless it is empty, already visited or already queued."""
        if url != '' and url not in self.visited and url not in self.unvisited:
            return self.unvisited.insert(0, url)

    def removeVisited(self, url):
        """Remove *url* from the visited list (ValueError if it is absent)."""
        return self.visited.remove(url)

    def popUnvisitedUrl(self):
        """Take the oldest pending URL, or return None when nothing is pending."""
        # Explicit emptiness check instead of a bare ``except`` so that
        # unrelated errors (e.g. KeyboardInterrupt) are not swallowed.
        if not self.unvisited:
            return None
        return self.unvisited.pop()

    def unvisitedUrlEmpty(self):
        """Return True when no URL is pending."""
        return len(self.unvisited) == 0
class Spider():
    """The actual crawler: walks the pages of one site starting from a URL.

    Every fetched page is passed to ``handler`` for banned-word detection and
    its same-site links are queued for crawling.
    """

    def __init__(self, url):
        self.linkQuence = linkQuence()        # visited / pending URL bookkeeping
        self.linkQuence.addUnvisitedUrl(url)  # seed the queue with the start URL
        self.current_deepth = 1               # number of the current crawl pass

    # Note on crawl order: a crawl can go depth-first (follow one link of a
    # page as deep as possible) or breadth-first (collect all links of a page,
    # then filter and process them). Either way the crawl can loop forever
    # unless already-crawled URLs are told apart from newly found ones, which
    # is what the linkQuence bookkeeping is for.

    def getPageLinks(self, url):
        """Fetch *url*, run the banned-word handler on it and return its links.

        :param url: address of the page to download
        :return: raw ``href`` values found in the page source, or an empty
            list if the page could not be downloaded
        """
        import logging
        log = logging.getLogger(__name__)
        try:
            # A timeout keeps one unresponsive server from hanging the crawl.
            pageSource = requests.get(url, timeout=10).text
        except requests.RequestException as exc:
            log.warning("could not fetch %s: %s", url, exc)
            return []
        pageLinks = re.findall(r'(?<=href=\").*?(?=\")|(?<=href=\').*?(?=\')', pageSource)
        try:
            # Banned-word detection for this page.
            handler(url, pageSource)
        except (OSError, UnicodeError) as exc:
            # A failed report write must not stop the crawl or lose the links.
            log.warning("could not record banned words for %s: %s", url, exc)
        return pageLinks

    def processUrl(self, url):
        """Turn the links of the page at *url* into absolute http(s) URLs.

        Relative links are resolved against *url*, fragments (``#...``) are
        removed, and links with any other scheme (``javascript:``,
        ``mailto:``, ...) are dropped.

        :param url: address of the page whose links are processed
        :return: list of absolute URLs (may contain duplicates)
        """
        from urllib.parse import urldefrag, urljoin, urlsplit
        true_url = []
        for raw in self.getPageLinks(url):
            link = raw.strip()
            if not link or link.startswith('#'):
                continue  # empty or pure in-page anchor
            try:
                absolute = urldefrag(urljoin(url, link)).url
                scheme = urlsplit(absolute).scheme
            except ValueError:
                continue  # malformed link
            if scheme in ('http', 'https'):
                true_url.append(absolute)
        return true_url

    def sameTargetUrl(self, url):
        """Keep only the links of the page at *url* that stay on this site.

        A link belongs to the site when its host name equals the module-level
        ``domain_url`` host or is a sub-domain of it; ports are ignored.

        :param url: address of the page whose links are filtered
        :return: list of same-site absolute URLs
        """
        from urllib.parse import urlsplit
        site_host = (urlsplit('//' + domain_url).hostname or '').lower()
        if not site_host:
            return []
        same_target_url = []
        for link in self.processUrl(url):
            try:
                host = urlsplit(link).hostname or ''
            except ValueError:
                continue
            # Compare host names instead of searching the whole URL, so the
            # domain appearing in a path or query string does not count.
            if host == site_host or host.endswith('.' + site_host):
                same_target_url.append(link)
        return same_target_url

    def unrepectUrl(self, url):
        """Return the same-site links of the page at *url* without duplicates.

        The order of first occurrence is preserved.
        """
        return list(dict.fromkeys(self.sameTargetUrl(url)))

    def crawler(self, crawl_deepth=1):
        """Run the crawl and return the list of visited URLs.

        :param crawl_deepth: number of passes over the pending queue
        :return: the list of URLs that were visited

        NOTE: links discovered during a pass are appended to the same queue
        that the pass is draining, so a single pass already reaches every
        same-site page that can be discovered from the start URL.
        """
        while self.current_deepth <= crawl_deepth:
            while not self.linkQuence.unvisitedUrlEmpty():
                visitedUrl = self.linkQuence.popUnvisitedUrl()
                if visitedUrl is None or visitedUrl == '':
                    continue
                links = self.unrepectUrl(visitedUrl)
                self.linkQuence.addVisitedUrl(visitedUrl)
                for link in links:
                    self.linkQuence.addUnvisitedUrl(link)
            self.current_deepth += 1
        return self.linkQuence.visited
if __name__ == '__main__':
    spider = Spider(url)
    spider.crawler(1)  # passing 1 is enough
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment