Skip to content

Instantly share code, notes, and snippets.

@wangx2
Created May 6, 2016 15:55
Show Gist options
  • Save wangx2/eac2647c482fc1da86712db8faab6553 to your computer and use it in GitHub Desktop.
#! /usr/bin/python
# -*- coding:utf-8 -*-
import requests
from bs4 import BeautifulSoup
import re
from urllib.request import HTTPError
import time
import os
from _ast import Num
class Mymm():
    """Crawler for the souutu.com girl-gallery section.

    Walks the paginated listing index (pages 1-99), extracts every
    gallery's link, cover-image URL, title, and picture count from each
    listing page, then opens each gallery page in turn.

    NOTE(review): the original paste had lost all indentation; structure
    reconstructed from the statement order. Network access happens in
    ``__init__`` (kept for backward compatibility with ``Mymm()`` callers).
    """

    def __init__(self):
        # Browser-like request headers so the site serves normal pages.
        # BUG FIX: the original contained the malformed key
        # 'Accept:text/css' — replaced with a standard 'Accept' header.
        header = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) '
                          'AppleWebKit/537.36 (KHTML, like Gecko) '
                          'Chrome/45.0.2454.101 Safari/537.36',
            'Referer': 'http://www.souutu.com/mnmm/',
            'Host': 'www.souutu.com',
            'Accept': '*/*;q=0.1',
            'Accept-Encoding': 'gzip, deflate, sdch',
            'Accept-Language': 'zh-CN,zh;q=0.8',
            'Connection': 'keep-alive',
        }
        try:
            # Page 1 uses index.html; pages 2..99 use index_<n>.html.
            for page in range(1, 100):
                if page == 1:
                    url = 'http://www.souutu.com/mnmm/index.html'
                else:
                    url = 'http://www.souutu.com/mnmm/index_' + str(page) + '.html'
                # BUG FIX: the original assigned ``open_web.headers = header``
                # AFTER the request, which only overwrites the *response*
                # headers and never sends them. Pass them to the request.
                open_web = requests.get(url, headers=header, timeout=30)
                # BUG FIX: requests never raises urllib's HTTPError, so the
                # original except clause was dead. raise_for_status() raises
                # requests.exceptions.HTTPError on 4xx/5xx responses.
                open_web.raise_for_status()
                open_web.encoding = 'utf-8'  # decode body as UTF-8
                self.AnalyHtml(open_web.text, header)  # parse this listing page
        except requests.exceptions.HTTPError as e:
            print('网站打开错误', e)
        except (requests.exceptions.Timeout, TimeoutError) as e:
            print('网站打开超时', e)

    def AnalyHtml(self, html, header):
        """Parse one listing page and collect per-gallery metadata.

        :param html: listing-page HTML text
        :param header: request headers, forwarded to openContent()
        """
        self.html = html
        self.header = header
        soup = BeautifulSoup(self.html, 'html5lib')
        piclistheight = soup.find(id='piclistheight')
        link = []     # gallery page URLs
        img_url = []  # cover-image URLs (lazy-loaded 'lazysrc' attribute)
        title = []    # gallery titles
        num = []      # picture-count text shown in the listing's <span>
        for it in piclistheight.find_all(attrs={'class': 'timg'}):
            link.append(it.find('a').get('href'))
            img_url.append(it.find('img').get('lazysrc'))
            title.append(it.find('img').get('title'))
            num.append(it.find('span').get_text())
        self.openContent(link, img_url, title, num, self.header)

    def openContent(self, link, img_url, title, num, header):
        """Open every gallery page collected from a listing page.

        :param link: gallery URLs
        :param img_url: cover-image URLs (stored, not yet used)
        :param title: gallery titles (stored, not yet used)
        :param num: picture-count strings, parallel to ``link``
        :param header: request headers built in __init__
        """
        self.link = link
        self.img_url = img_url
        self.title = title
        self.num = num
        self.header = header
        # BUG FIX: the original ran ``for content_num in self.num: pass``
        # inside this loop, which always left content_num at the LAST
        # count. Each link is paired with its own count via zip().
        for content_link, content_num in zip(self.link, self.num):
            content_open = requests.get(content_link, headers=self.header,
                                        timeout=30)
            content_open.encoding = 'utf-8'
            soup = BeautifulSoup(content_open.text, 'html5lib')
            showimages = soup.find(id='showimages')
            # First in-page image URL; extracted but not yet consumed
            # (the original did the same — kept for parity).
            li = showimages.find('img').get('src')
            self.ContentImage(content_link, content_open.text,
                              content_num, self.header)

    def ContentImage(self, content_link, content_text, content_num,
                     content_header):
        """Handle a single gallery page (currently just records + prints).

        :param content_link: gallery page URL
        :param content_text: gallery page HTML
        :param content_num: picture-count string for this gallery
        :param content_header: request headers
        """
        self.content_link = content_link
        self.content_text = content_text
        self.num = content_num
        self.header = content_header
        print(content_link, content_num)
# BUG FIX: the original compared the string literal '__name' (note the
# typo and missing underscores) to '__main__', which is always False,
# while unconditionally instantiating Mymm() — starting the whole crawl
# as an import-time side effect. The crawl now runs only when this file
# is executed as a script.
if __name__ == '__main__':
    mm = Mymm()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment