-
-
Save wangx2/eac2647c482fc1da86712db8faab6553 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#! /usr/bin/python
# -*- coding:utf-8 -*-
"""Crawler script for www.souutu.com image galleries (see class Mymm)."""
# Standard library
import os
import re
import time
from _ast import Num  # NOTE(review): unused — looks like an accidental IDE auto-import; safe to delete
from urllib.error import HTTPError  # canonical home of HTTPError (urllib.request merely re-exports it)

# Third party
import requests
from bs4 import BeautifulSoup
class Mymm():
    """Image-gallery crawler for http://www.souutu.com/mnmm/.

    Constructing an instance immediately crawls the paginated index
    (index.html, then index_2.html .. index_99.html), hands each page to
    AnalyHtml, which collects per-gallery links/covers/titles/counts and
    opens every gallery page via openContent -> ContentImage.
    """

    # Last index page to fetch; the original loop covered pages 2..99.
    LAST_PAGE = 99

    def __init__(self):
        # Browser-like headers so the site treats us as a normal visitor.
        header = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36',
            'Referer': 'http://www.souutu.com/mnmm/',
            'Host': 'www.souutu.com',
            # BUG FIX: the key was the malformed 'Accept:text/css'.
            'Accept': '*/*;q=0.1',
            'Accept-Encoding': 'gzip, deflate, sdch',
            'Accept-Language': 'zh-CN,zh;q=0.8',
            'Connection': 'keep-alive',
        }
        # Page 1 uses a different URL pattern than pages 2..LAST_PAGE.
        urls = ['http://www.souutu.com/mnmm/index.html']
        urls += ['http://www.souutu.com/mnmm/index_' + str(page) + '.html'
                 for page in range(2, self.LAST_PAGE + 1)]
        try:
            for url in urls:
                # BUG FIX: headers must be passed with the request; the
                # original assigned them to the *response* object after the
                # fact, which has no effect on the HTTP call.
                open_web = requests.get(url, headers=header)
                open_web.encoding = 'utf-8'
                self.AnalyHtml(open_web.text, header)
        except HTTPError as e:
            print('网站打开错误', e)
        except TimeoutError as e:
            print('网站打开超时', e)
        except requests.exceptions.RequestException as e:
            # BUG FIX: requests raises its own exception hierarchy, not
            # urllib's HTTPError, so real network errors were uncaught.
            print('网站打开错误', e)

    def AnalyHtml(self, html, header):
        """Parse one index page and open every gallery listed on it.

        html   -- the index page's HTML text.
        header -- request headers, forwarded to openContent.
        """
        self.html = html
        self.header = header
        soup = BeautifulSoup(self.html, 'html5lib')
        piclistheight = soup.find(id='piclistheight')
        link = []     # gallery page URLs
        img_url = []  # cover-image URLs
        title = []    # gallery titles
        num = []      # picture-count text per gallery
        for item in piclistheight.find_all(attrs={'class': 'timg'}):
            link.append(item.find('a').get('href'))
            img_url.append(item.find('img').get('lazysrc'))
            title.append(item.find('img').get('title'))
            num.append(item.find('span').get_text())
        self.openContent(link, img_url, title, num, self.header)

    def openContent(self, link, img_url, title, num, header):
        """Fetch each gallery page and forward it to ContentImage."""
        self.link = link
        self.img_url = img_url
        self.title = title
        self.num = num
        self.header = header
        # BUG FIX: the original exhausted `num` in a do-nothing inner loop,
        # so every gallery was reported with the LAST picture count; pair
        # each link with its own count instead.
        for content_link, content_num in zip(self.link, self.num):
            content_open = requests.get(content_link, headers=self.header)
            content_open.encoding = 'utf-8'
            soup = BeautifulSoup(content_open.text, 'html5lib')
            showimages = soup.find(id='showimages')
            first_img = showimages.find('img').get('src')  # NOTE(review): computed but unused, kept from original
            self.ContentImage(content_link, content_open.text, content_num, self.header)

    def ContentImage(self, content_link, content_text, content_num, content_header):
        """Record one gallery's data; image downloading is not implemented yet."""
        self.content_link = content_link
        self.content_text = content_text
        self.num = content_num
        self.header = content_header
        print(content_link, content_num)
# Entry point: run the crawler only when executed as a script.
# BUG FIX: the original compared the literal '__name' to '__main__' (always
# False) and instantiated Mymm unconditionally, so the crawl ran on import.
if __name__ == '__main__':
    print('hello')
    mm = Mymm()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment