Created
November 4, 2018 09:08
-
-
Save XiaochenCui/dc029dd018d02037e622c59086e94fa4 to your computer and use it in GitHub Desktop.
一个简单的爬虫
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*-
"""
A simple crawler: download one web page and save images found on it.
"""
import os

import requests
from bs4 import BeautifulSoup

# This program processes a single page.

# Step 1: download the page (human equivalent: open it in a browser).
# A timeout keeps the script from hanging forever on a stalled connection.
r = requests.get(
    "https://movie.douban.com/subject/26793385/?from=showing",
    timeout=10,
)
# Fail loudly on an HTTP error instead of parsing an error page.
r.raise_for_status()

# Step 2: analyse the page (human equivalent: look at it).
# Turn the downloaded HTML into a structured, searchable tree.
soup = BeautifulSoup(r.text, 'html.parser')

# The elements we are after are pictures, i.e. <img> tags.
# Extract them using the feature spotted earlier in the browser.
tags = soup.find_all('img')

# Work inside the download folder so relative file names land there.
# The folder is created on demand so chdir does not fail on a fresh machine.
download_dir = "/Users/cuixiaochen/电影图片/"
os.makedirs(download_dir, exist_ok=True)
os.chdir(download_dir)

# The page may contain no <img> at all, and an <img> may lack a src,
# so only download when there really is a URL to fetch.
src = tags[0].attrs.get('src') if tags else None
if src:
    print(src)
    # Download the picture.
    picture = requests.get(src, timeout=10)
    if picture.status_code == 200:
        # Save the picture.
        with open("test.jpg", "wb") as f:
            f.write(picture.content)
    else:
        # Do not write an error body to disk as if it were an image.
        print("Skipping, HTTP status: " + str(picture.status_code))
else:
    print("No image with a src found on the page")
def save_picture(tag):
    """Download the image behind an ``<img>`` tag and save it as ``<alt>.jpg``.

    The file is written to the current working directory and is named after
    the tag's ``alt`` text.

    Args:
        tag: A parsed tag exposing an ``attrs`` mapping (for example a
            BeautifulSoup ``Tag``) that may hold ``src`` and ``alt`` entries.

    Returns:
        None. Nothing is downloaded when the tag has no ``src`` or no usable
        ``alt`` text, and nothing is written when the server does not answer
        with status 200.
    """
    src = tag.attrs.get('src')
    if not src:
        # Without a source URL there is nothing to download.
        return
    print(src)

    # The alt text becomes the file name; check it before hitting the
    # network so that unnamed images do not cost a request.
    name = tag.attrs.get('alt')
    if not name:
        return

    # Download the picture; the timeout prevents hanging on a dead server.
    picture = requests.get(src, timeout=10)
    if picture.status_code != 200:
        # Skip instead of saving an error body as if it were an image.
        print("Skipping, HTTP status: " + str(picture.status_code))
        return

    print("Name is: " + name)
    # Path separators in the alt text would point at another (possibly
    # missing) directory, so flatten them into underscores.
    filename = name.replace("/", "_").replace("\\", "_") + ".jpg"
    # Save the picture.
    with open(filename, "wb") as f:
        f.write(picture.content)
# Practise using the function: save every picture found on the page.
import time

# Seconds to wait before each download, so requests are spaced out.
pause_seconds = 1

for tag in tags:
    time.sleep(pause_seconds)
    save_picture(tag)

# Step 3: save the required information.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment