Created
September 10, 2017 16:29
-
-
Save mpco/0074c65bd634a44832c6785f11127752 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3
# -*- coding: UTF-8 -*-
import os
import re
import sys
import time
from urllib import request

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
# Make a webdriver binary placed next to this script discoverable by
# appending the script's directory to PATH.
dirPath = os.path.dirname(os.path.realpath(__file__))
# os.pathsep is ":" on POSIX and ";" on Windows; .get() avoids a KeyError
# when PATH is not set at all.
os.environ['PATH'] = os.environ.get('PATH', '') + os.pathsep + dirPath
# Open the page whose URL is given as the first command-line argument.
if len(sys.argv) < 2:
    # Exit with a usage hint instead of an opaque IndexError.
    sys.exit("用法: " + os.path.basename(sys.argv[0]) + " <知乎页面URL>")
url = sys.argv[1]
# The last path segment is used later to name the output folder and link
# file. Drop any query string / fragment and a trailing slash first so the
# ID is never empty and contains no "?" or "#".
answerID = re.split(r"[?#]", url, maxsplit=1)[0].rstrip("/").split("/")[-1]
driver = webdriver.Chrome()
driver.get(url)
# Handle the case where the page requires a login before showing content.
# Explicit checks are used instead of `assert`, because assertions are
# stripped under `python -O` and the login branch would then never run.
if "- 知乎" not in driver.title:
    if "与世界分享你的知识、经验和见解" not in driver.title:
        raise RuntimeError("无法识别的页面: " + driver.title)
    # find_element(By.XPATH, ...) works on both Selenium 3 and 4, whereas
    # the find_element_by_* helpers were removed in Selenium 4.3.
    signinButton = driver.find_element(By.XPATH, "//a[@href='#signin']")
    signinButton.click()
    # Block until the user has finished logging in by hand in the browser.
    input("请登录\n回车键以继续...")
    if "- 知乎" not in driver.title:
        raise RuntimeError("登录后页面标题异常: " + driver.title)
# Load all answers: repeatedly scroll to the bottom of the page and click
# the "more answers" button until it can no longer be found or clicked.
print("网页加载中...")
while True:
    # Give the page time to load the next batch.
    time.sleep(5)
    # Scroll to the bottom of the page.
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    try:
        moreAnswerButton = WebDriverWait(driver, 30).until(
            EC.presence_of_element_located(
                (By.XPATH, "//button[@class='Button QuestionMainAction']")))
        moreAnswerButton.click()
    except WebDriverException:
        # Covers the wait timing out (TimeoutException) as well as click
        # failures; any of them ends the loading loop. Non-Selenium errors
        # are no longer silently swallowed.
        break
input("回车 以开始处理网页....")
# Parse the fully loaded page and collect the image URLs.
html = driver.page_source
bsObj = BeautifulSoup(html, 'html.parser')
# Images that are already loaded: <img> tags carrying both a
# `data-rawwidth` attribute and an https `data-original` URL.
# (r'\d{0,4}' also matches zero digits, so it only requires that the
# attribute is present.)
dataList_1 = bsObj.find_all(name='img', attrs={
    'data-rawwidth': re.compile(r'\d{0,4}'),
    'data-original': re.compile(r'https://')})
# Images that are not loaded yet: placeholder <div>s with an https
# `data-src` URL.
dataList_2 = bsObj.find_all(name='div', attrs={
    'class': "VagueImage origin_image zh-lightbox-thumb",
    'data-src': re.compile(r'https://')})
urlList_1 = [data.attrs['data-original'] for data in dataList_1]
urlList_1.extend(data.attrs['data-src'] for data in dataList_2)
# URLs ending in `_r` point at the original-size image. Only rewrite the
# `_b` size suffix (right before the optional extension and optional
# query/fragment); a plain str.replace would also corrupt any `_b` that
# happens to occur elsewhere in the URL, e.g. inside the image hash.
urlList = [re.sub(r'_b(?=(?:\.\w+)?(?:[?#]|$))', '_r', urlItem)
           for urlItem in urlList_1]
# Output folder: "<YYYYMMDD>-id<answerID>" next to this script.
dirName = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                       time.strftime("%Y%m%d") + "-id" + answerID)
# exist_ok avoids crashing (and losing the scraped session) when the script
# is run again for the same page on the same day.
os.makedirs(dirName, exist_ok=True)
# Save the image links to a text file, one URL per line, echoing each one.
linkFilePath = os.path.join(dirName, "imgLink-id" + answerID + ".txt")
# `with` guarantees the file is closed even if a write fails; the explicit
# encoding keeps the output independent of the platform default.
with open(linkFilePath, 'w', encoding='utf-8') as f:
    for index, imgUrl in enumerate(urlList):
        f.write(imgUrl + "\n")
        print("-----" + str(index + 1) + ":" + imgUrl + "-------")
input("回车 以继续下载图片...")
# The browser is no longer needed. quit() ends the whole WebDriver session
# (including the driver process), whereas close() only closes the window.
driver.quit()
# Download the images, naming them 1, 2, 3, ... plus the URL's extension.
for index, imgUrl in enumerate(urlList):
    # Ignore any query string when deriving the file extension.
    _, file_extension = os.path.splitext(imgUrl.split("?", 1)[0])
    try:
        # Fetch before opening the target file so a failed download does
        # not leave an empty file behind; the timeout prevents one stalled
        # connection from hanging the whole run.
        with request.urlopen(imgUrl, timeout=30) as response:
            imgData = response.read()
    except OSError as err:
        # URLError, HTTPError and socket timeouts are all OSError
        # subclasses; skip this image and keep going with the rest.
        print("下载 第 " + str(index + 1) + " 张图片 失败: " + str(err))
        continue
    with open(os.path.join(dirName, str(index + 1) + file_extension), 'wb') as w:
        w.write(imgData)
    print("下载 第 " + str(index + 1) + " 张图片")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment