-
-
Save flyfire/02c2e3a965f34f6f77db0c43f2088ef9 to your computer and use it in GitHub Desktop.
Creates full-page screenshots of articles; works for WeChat articles with lazy-loaded images.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
'''
Usage: python archive_articles.py test.csv
Input: test.csv
    name url
    1 url1
    2 url2
    .....
Output:
    1.png
    2.png
    .....
Dependency:
    Firefox
    pip install selenium
    pip install pillow (for image compression)
    brew install geckodriver (for mac)
'''
import time | |
from selenium import webdriver | |
from selenium.webdriver.chrome.options import Options as ChromeOptions | |
from selenium.webdriver.firefox.options import Options as FirefoxOptions | |
import csv | |
import sys | |
from PIL import Image | |
import math | |
def fullpage_screenshot(nameAndURL):
    """Save a full-page PNG screenshot for every (name, url) entry.

    Each page is opened in headless Firefox. The window is resized and the
    page scrolled so that lazily loaded images are fetched, then the window
    is resized to the full document height and captured to ``<name>.png`` in
    the current working directory. The PNG is finally re-saved through Pillow
    with ``optimize=True`` to reduce its size.

    Args:
        nameAndURL: Iterable of sequences of strings; item 0 is the output
            file stem, item 1 is the URL to capture. Surrounding whitespace
            (e.g. a trailing newline from line-based parsing) is stripped.
    """
    # Only work with Firefox; Chrome does not work
    options = webdriver.FirefoxOptions()
    options.add_argument("--headless")
    driver = webdriver.Firefox(options=options)
    try:
        driver.maximize_window()
        for pageInfo in nameAndURL:
            name = pageInfo[0].strip()
            url = pageInfo[1].strip()
            print('Capturing: ', name, url)
            driver.get(url)
            time.sleep(2)
            height = driver.execute_script("return document.body.scrollHeight")
            # The trick for lazy loading of images in WeChat articles:
            # shrink the window, scroll down and back up, then wait.
            # Clamp the height so pages shorter than 1000px do not request
            # a negative (invalid) window size.
            driver.set_window_size(1000, max(height - 1000, 1))
            driver.execute_script("window.scrollTo(0, 1000)")
            driver.execute_script("window.scrollTo(0, 0)")
            time.sleep(10)  # new images need time to load
            new_height = driver.execute_script("return document.body.scrollHeight")
            driver.set_window_size(1000, new_height)
            output_path = name + ".png"
            driver.save_screenshot(output_path)
            # Optimize image. PNG is lossless, so only ``optimize`` applies;
            # the context manager guarantees the file handle is released.
            with Image.open(output_path) as screenshot:
                screenshot.save(output_path, optimize=True)
    finally:
        # Always shut the browser down, even if a page fails, so that no
        # headless Firefox process is left running.
        driver.quit()
if __name__ == "__main__":
    # Validate the command line explicitly; ``assert`` is stripped when
    # Python runs with -O and gives no usage hint on failure.
    if len(sys.argv) != 2:
        sys.exit("Usage: python archive_articles.py test.csv")
    nameAndURL = []
    with open(sys.argv[1]) as f:
        next(f, None)  # skip the "name url" header line
        for line in f:
            # split() with no argument tolerates tabs and repeated spaces and
            # drops the trailing newline that split(' ') left on the URL.
            fields = line.split()
            if len(fields) < 2:
                continue  # ignore blank or incomplete lines
            nameAndURL.append(fields)
    fullpage_screenshot(nameAndURL)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment