A Python crawler that scrapes girl photos from www.souutu.com
#! /usr/bin/python3
# -*- coding:utf-8 -*-
import re
import time

import requests
from bs4 import BeautifulSoup


class Mymm():
    def __init__(self, url):
        self.url = url
        header = {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
                  'Referer': 'http://www.souutu.com/mnmm/',
                  'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
                  'Accept-Encoding': 'gzip, deflate, sdch',
                  'Connection': 'keep-alive',
                  'Cache-Control': 'max-age=0',
                  'Host': 'www.souutu.com',
                  'Upgrade-Insecure-Requests': '1'}
        index = requests.get(self.url, headers=header)  # fetch the listing page
        time.sleep(4)
        bs = BeautifulSoup(index.content, 'html5lib')
        load_img = bs.find(attrs={'class': 'sumlist'})
        g_num = load_img.find_all('a')
        # read the total page count from the last pagination link
        # (the regex assumes a three-digit count)
        num = re.search(r'\d{3}', g_num[-1].text).group()
        for page in range(1, int(num) + 1):
            if page == 1:
                load_img = bs.find(id='load-img')
                ul_img = load_img.find_all(attrs={'class': 'timg'})
                # collect each gallery link together with its picture count
                for link in ul_img:
                    href = link.find('a').get('href')
                    url = re.findall(r'(http://www\.souutu\.com/mnmm/.+)', href)
                    img_num = re.findall(r'(\d+)', link.find('span').text)
                    self.get_link(''.join(url), ''.join(img_num))
            else:
                # build the URL for every page after the first
                page_url = 'http://www.souutu.com/mnmm/index_' + str(page) + '.html'
                page_index = requests.get(page_url, headers=header)
                bs = BeautifulSoup(page_index.content, 'html5lib')
                load_img = bs.find(id='load-img')
                ul_img = load_img.find_all(attrs={'class': 'timg'})
                for link in ul_img:
                    href = link.find('a').get('href')
                    url = re.findall(r'(http://www\.souutu\.com/mnmm/.+)', href)
                    img_num = re.findall(r'(\d+)', link.find('span').text)
                    # join the findall results into plain strings before passing them on
                    self.get_link(''.join(url), ''.join(img_num))

    def get_link(self, url, img_num):
        self.url = url
        self.img_num = img_num
        links = {self.url: self.img_num}  # map the gallery URL to its picture count ('dict' would shadow the builtin)
        for k, v in links.items():
            content = requests.get(k).content
            time.sleep(4)
            bs = BeautifulSoup(content, 'html5lib')
            getimg_all = bs.find(attrs={'class': 'cl gallery'})
            # each match splits into (host/date prefix, middle segment, 17-digit filename);
            # recombining groups 0 and 2 drops the middle segment to rebuild the image URL
            get_imgurl = re.findall(r'(http://img\.souutu\.com/\d{4}/\d{4}/)(.+)(\d{17}\.jpg)', str(getimg_all))
            for g in get_imgurl:
                gg = g[0] + g[2]
                print(gg)
                self.save_img(gg)

    def save_img(self, get_imgurl):
        self.get_imgurl = get_imgurl
        # append each image URL to mm.txt, one per line
        with open('mm.txt', 'a') as save:
            save.write(self.get_imgurl + '\n')


mm = Mymm('http://www.souutu.com/mnmm/')
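
Note that the script above only records the image URLs in mm.txt; it never fetches the files themselves. Below is a minimal sketch of a downloader for those URLs. The helper name download_img, the mm/ output directory, and the timeout value are illustrative assumptions, not part of the original gist.

# Minimal sketch: download the URLs that save_img wrote to mm.txt.
# download_img, the mm/ directory, and the timeout are assumptions,
# not part of the original script.
import os
import requests

def download_img(url_file='mm.txt', out_dir='mm'):
    os.makedirs(out_dir, exist_ok=True)
    with open(url_file) as f:
        for line in f:
            url = line.strip()
            if not url:
                continue
            name = url.rsplit('/', 1)[-1]  # reuse the 17-digit filename from the URL
            resp = requests.get(url, timeout=10)
            if resp.status_code == 200:
                with open(os.path.join(out_dir, name), 'wb') as img:
                    img.write(resp.content)

download_img()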