url.txt 格式见 url.txt~
代码存在一个已知问题:爬取 www.qq.com 时会出错,原因在于网页编码。代码直接读取响应声明的编码,再用这个编码对内容进行编解码;而有的网页没有设置编码,读取不到编码,结果就会出现乱码。
| #!/usr/bin/env python3 | |
| # -*- coding: utf-8 -*- | |
| # author: tennc | |
| # date: 2016/3/20 | |
| # filename: paer.py | |
| # 检测一堆二级域名的200状态,并且爬取名称保存 | |
| # url.txt 为二级域名保存文件,save.txt为结果文件。 | |
| # The MIT License | |
| # NAME | |
| # Copyright (c) 2015 | |
| # | |
| # Permission is hereby granted, free of charge, to any person obtaining a | |
| # copy of this software and associated documentation files (the "Software"), | |
| # to deal in the Software without restriction, including without limitation | |
| # the rights to use, copy, modify, merge, publish, distribute, sublicense, | |
| # and/or sell copies of the Software, and to permit persons to whom the | |
| # Software is furnished to do so, subject to the following conditions: | |
| # | |
| # The above copyright notice and this permission notice shall be included in | |
| # all copies or substantial portions of the Software. | |
| # | |
| # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
| # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
| # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |
| # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |
| # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING | |
| # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER | |
| # DEALINGS IN THE SOFTWARE. | |
| import requests | |
| from bs4 import BeautifulSoup | |
import sys

# Probe each URL listed in url.txt (one per line) and, for every page that
# answers with HTTP 200, write a "<title>,<url>" line to save.txt.
#
# `with` guarantees both files are closed even if the loop aborts; the
# previous version ended with `saveurl.closed`, which only reads an attribute
# and never closed anything.
# save.txt is written as UTF-8 so that a title the locale's default encoding
# cannot represent does not make the write fail.
with open("url.txt") as url, open("save.txt", "w+", encoding="utf-8") as saveurl:
    for i in url:
        i = i.strip()
        if not i:
            # Blank lines are not URLs; there is nothing to request.
            continue
        try:
            # A timeout keeps one unresponsive host from stalling the scan.
            r = requests.get(i, timeout=10)
        except requests.RequestException as e:
            # Best-effort scan: report the failure and move on to the next
            # URL instead of silently discarding the error.
            print("request failed:", i, e, file=sys.stderr)
            continue
        if r.status_code != 200:
            continue
        # Hand the raw bytes to BeautifulSoup so it can work out the charset
        # itself. Re-encoding r.text with r.encoding breaks when the server
        # sends no charset: r.encoding is then None or a fallback guess, which
        # either raises or yields garbled titles.
        issue = BeautifulSoup(r.content, "html.parser")
        if issue.title is None:
            # No <title> element: nothing to record for this URL.
            print("no <title> found:", i, file=sys.stderr)
            continue
        titlename = issue.title.string
        print(titlename, i, "\n")
        saveurl.write(str(titlename) + "," + str(i) + "\n")
| http://www.qq.com | |
| http://ww1.qq.com | |
| http://lol.qq.com | |
| http://110.qq.com | |
| http://guanjia.qq.com |