Skip to content

Instantly share code, notes, and snippets.

@wangx2
Created May 26, 2016 13:44
Show Gist options
  • Save wangx2/8526829325554b5e3b02f3247066dbb7 to your computer and use it in GitHub Desktop.
Save wangx2/8526829325554b5e3b02f3247066dbb7 to your computer and use it in GitHub Desktop.
python爬虫抓取美女图片 (Python web crawler that scrapes gallery photos from www.souutu.com)
#! /usr/bin/python3
# -*- coding:utf-8 -*-
import requests
import re
from bs4 import BeautifulSoup
import time
class Mymm():
    """Crawl photo-gallery listings from www.souutu.com and save image URLs.

    Instantiating the class immediately performs the whole crawl: it walks
    every listing page, follows each gallery link, extracts the direct image
    URLs and appends them (one per line) to ``mm.txt``.
    """

    def __init__(self, url):
        """Fetch the listing index at *url* and crawl every listing page.

        :param url: base listing URL, e.g. ``http://www.souutu.com/mnmm/``
        """
        self.url = url
        # Browser-like headers so the site serves the normal HTML page.
        header = {
            'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
            'Referer': 'http://www.souutu.com/mnmm/',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate, sdch',
            'Connection': 'keep-alive',
            'Cache-Control': 'max-age=0',
            'Host': 'www.souutu.com',
            'Upgrade-Insecure-Requests': '1',
        }
        index = requests.get(self.url, headers=header)  # fetch the first listing page
        time.sleep(4)  # be polite to the server between requests
        bs = BeautifulSoup(index.content, 'html5lib')
        pager = bs.find(attrs={'class': 'sumlist'})
        pager_links = pager.find_all('a')
        # The last pager link holds the total number of listing pages.
        # The original pattern r'\d{3}' assumed exactly three digits;
        # \d+ handles any page count.
        num = re.search(r'\d+', pager_links[-1].text).group()
        for page in range(1, int(num) + 1):
            if page == 1:
                # Page 1 is the index we already downloaded.
                self._scrape_listing(bs)
            else:
                # Subsequent pages follow the index_<n>.html naming pattern.
                page_url = 'http://www.souutu.com/mnmm/index_' + str(page) + '.html'
                page_index = requests.get(page_url, headers=header)
                self._scrape_listing(BeautifulSoup(page_index.content, 'html5lib'))

    def _scrape_listing(self, bs):
        """Extract every gallery link and its picture count from one parsed
        listing page and hand each pair to :meth:`get_link`."""
        load_img = bs.find(id='load-img')
        for link in load_img.find_all(attrs={'class': 'timg'}):
            href = link.find('a').get('href')
            url = re.findall(r'(http://www.souutu.com/mnmm/.+)', href)
            # Picture count shown next to the thumbnail.  The original
            # pattern ([1-9]) dropped zeros, turning e.g. "10" into "1";
            # \d+ captures the full number.
            img_num = re.findall(r'\d+', link.find('span').text)
            self.get_link(''.join(url), ''.join(img_num))

    def get_link(self, url, img_num):
        """Download one gallery page and save all image URLs found on it.

        :param url: gallery page URL
        :param img_num: number of pictures in the gallery (kept for reference)
        """
        self.url = url
        self.img_num = img_num
        galleries = {self.url: self.img_num}  # gallery URL -> picture count
        for gallery_url in galleries:  # renamed from 'dict' (shadowed builtin)
            content = requests.get(gallery_url).content
            time.sleep(4)  # throttle between gallery downloads
            bs = BeautifulSoup(content, 'html5lib')
            gallery = bs.find(attrs={'class': 'cl gallery'})
            # Image URLs look like <host>/<yyyy>/<mmdd>/...<17-digit-id>.jpg;
            # groups 0 and 2 re-assemble the direct image link.
            for g in re.findall(r'(http://img.souutu.com/\d{4}/\d{4}/)(.+)(\d{17}.jpg)', str(gallery)):
                img_url = g[0] + g[2]
                print(img_url)
                self.save_img(img_url)

    def save_img(self, get_imgurl):
        """Append one image URL to ``mm.txt`` (one URL per line)."""
        self.get_imgurl = get_imgurl
        # 'with' guarantees the file is closed even on error; writing the
        # whole string at once replaces the original char-by-char loop.
        with open('mm.txt', 'a') as save:
            save.write(self.get_imgurl)
            save.write('\n')
# Run the crawler only when executed as a script, not when imported —
# importing the module previously kicked off the full network crawl.
if __name__ == '__main__':
    mm = Mymm(r'http://www.souutu.com/mnmm/')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment