Created
May 5, 2016 00:34
-
-
Save FGFW/0d64178d6abd70e3f6ecafffc8173bf7 to your computer and use it in GitHub Desktop.
python抓取性感尤物美女图.py
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| """ | |
| python抓取性感尤物美女图.py | |
| 2016年5月4日 00:51:00 codegay | |
| 参考资料: Python3学习笔记(urllib模块的使用) | |
| http://www.cnblogs.com/Lands-ljk/p/5447127.html | |
| 以下例子是python2的代码,并且用到lxml,requests 库 | |
| 我用python3标准库和正则写一个下载全站美女图的程序 | |
| 使用python来批量抓取网站图片 | |
| http://www.cnblogs.com/TeyGao/p/5225940.html | |
| """ | |
| print("程序运行中...") | |
| import re | |
| from urllib import request | |
| import os | |
| from pprint import pprint | |
| from time import sleep | |
| rooturl="http://www.xgyw.cc/" | |
| def getclass(): | |
| rec=re.compile('''align=center\>\<a href="(/\w+/)\"\>(.+)\</a\>''') | |
| try: | |
| txt=request.urlopen(rooturl).read().decode("gbk") | |
| fl=rec.findall(txt) | |
| except: | |
| print("错误") | |
| sleep(1) | |
| print("分类:") | |
| pprint(fl) | |
| return fl | |
| fenlei=getclass()#下载所有分类下的图片 | |
| #fenlei=[getclass()[-1]]#只下载推女郎 | |
| def getpagelist(): | |
| plist=[] | |
| for f,n in fenlei: | |
| rec=re.compile('''({}page_\d+?\.html)'''.format(f)) | |
| try: | |
| txt=request.urlopen(rooturl+f).read().decode("gbk") | |
| t=sorted(set(rec.findall(txt)+[f])) | |
| plist+=t | |
| except: | |
| print("错误",e) | |
| sleep(1) | |
| #print("page_list:") | |
| #pprint(plist) | |
| return plist | |
| pagelist=getpagelist() | |
| def getalbumlist(): | |
| albumlist=[] | |
| for r in pagelist: | |
| print(rooturl+r) | |
| try: | |
| txt=request.urlopen(rooturl+r).read().decode("gbk") | |
| for x in re.findall(r'''href=(/(\w+)/(\2)\d+.html)''',txt): | |
| albumlist+=[x[0]] | |
| except: | |
| print("getalbumlist错误") | |
| sleep(1) | |
| return albumlist | |
| albumlist=getalbumlist() | |
| def getfphoto(): | |
| for r in albumlist: | |
| try: | |
| txt=request.urlopen(rooturl+r).read().decode("gbk") | |
| result=re.findall(r'''(/(\w+)/(\2)\d+_?\d*.html)''',txt) | |
| print(t) | |
| except: | |
| sleep(1) | |
| pass | |
| for x in result: | |
| try: | |
| html=request.urlopen(rooturl+x[0]).read().decode("gbk") | |
| jpgresult=re.findall('''src=\"(/uploadfile.*?\d+/\w+\.jpg)\"''',html) | |
| print(jpgresult) | |
| except: | |
| sleep(1) | |
| for h in jpgresult: | |
| try: | |
| request.urlretrieve(rooturl+h,os.path.basename(h)) | |
| except: | |
| print(3) | |
| sleep(1) | |
| getfphoto() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment