A Python crawler that scrapes girl photos from www.souutu.com
#! /usr/bin/python3
# -*- coding:utf-8 -*-
import re
import time

import requests
from bs4 import BeautifulSoup


class Mymm():
    def __init__(self, url):
        self.url = url
        header = {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
                  'Referer': 'http://www.souutu.com/mnmm/',
                  'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
                  'Accept-Encoding': 'gzip, deflate, sdch',
                  'Connection': 'keep-alive',
                  'Cache-Control': 'max-age=0',
                  'Host': 'www.souutu.com',
                  'Upgrade-Insecure-Requests': '1'}
        index = requests.get(self.url, headers=header)  # fetch the listing page
        time.sleep(4)
        bs = BeautifulSoup(index.content, 'html5lib')
        load_img = bs.find(attrs={'class': 'sumlist'})
        g_num = load_img.find_all('a')
        # read the total page count from the last pagination link
        # (the regex assumes a three-digit count)
        num = re.search(r'\d{3}', g_num[-1].text).group()
        for page in range(1, int(num) + 1):
            if page == 1:
                load_img = bs.find(id='load-img')
                ul_img = load_img.find_all(attrs={'class': 'timg'})
                # collect each gallery link together with its picture count
                for link in ul_img:
                    href = link.find('a').get('href')
                    url = re.findall(r'(http://www\.souutu\.com/mnmm/.+)', href)
                    img_num = re.findall(r'(\d+)', link.find('span').text)
                    self.get_link(''.join(url), ''.join(img_num))
            else:
                # build the URL for every page after the first
                page_url = 'http://www.souutu.com/mnmm/index_' + str(page) + '.html'
                page_index = requests.get(page_url, headers=header)
                bs = BeautifulSoup(page_index.content, 'html5lib')
                load_img = bs.find(id='load-img')
                ul_img = load_img.find_all(attrs={'class': 'timg'})
                for link in ul_img:
                    href = link.find('a').get('href')
                    url = re.findall(r'(http://www\.souutu\.com/mnmm/.+)', href)
                    img_num = re.findall(r'(\d+)', link.find('span').text)
                    # join the findall results into plain strings before passing them on
                    self.get_link(''.join(url), ''.join(img_num))

    def get_link(self, url, img_num):
        self.url = url
        self.img_num = img_num
        links = {self.url: self.img_num}  # map the gallery URL to its picture count ('dict' would shadow the builtin)
        for k, v in links.items():
            content = requests.get(k).content
            time.sleep(4)
            bs = BeautifulSoup(content, 'html5lib')
            getimg_all = bs.find(attrs={'class': 'cl gallery'})
            # each match splits into (host/date prefix, middle segment, 17-digit filename);
            # recombining groups 0 and 2 drops the middle segment to rebuild the image URL
            get_imgurl = re.findall(r'(http://img\.souutu\.com/\d{4}/\d{4}/)(.+)(\d{17}\.jpg)', str(getimg_all))
            for g in get_imgurl:
                gg = g[0] + g[2]
                print(gg)
                self.save_img(gg)

    def save_img(self, get_imgurl):
        self.get_imgurl = get_imgurl
        # append each image URL to mm.txt, one per line
        with open('mm.txt', 'a') as save:
            save.write(self.get_imgurl + '\n')


mm = Mymm('http://www.souutu.com/mnmm/')
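
Note that the script above only records the image URLs in mm.txt; it never fetches the files themselves. Below is a minimal sketch of a downloader for those URLs. The helper name download_img, the mm/ output directory, and the timeout value are illustrative assumptions, not part of the original gist.

# Minimal sketch: download the URLs that save_img wrote to mm.txt.
# download_img, the mm/ directory, and the timeout are assumptions,
# not part of the original script.
import os
import requests

def download_img(url_file='mm.txt', out_dir='mm'):
    os.makedirs(out_dir, exist_ok=True)
    with open(url_file) as f:
        for line in f:
            url = line.strip()
            if not url:
                continue
            name = url.rsplit('/', 1)[-1]  # reuse the 17-digit filename from the URL
            resp = requests.get(url, timeout=10)
            if resp.status_code == 200:
                with open(os.path.join(out_dir, name), 'wb') as img:
                    img.write(resp.content)

download_img()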