Last active
March 13, 2021 05:17
-
-
Save isaacgr/c2458ea740aa1e3c2645c6c3920e8933 to your computer and use it in GitHub Desktop.
Python code that uses Selenium to scrape quantamagazine.org and save the articles as PDF files via chromedriver
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import json
import os
import subprocess
import sys

from gql import gql, Client
from gql.transport.requests import RequestsHTTPTransport
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
# Characters removed from article titles before they are used as names.
# This is the set of characters Windows forbids in file names; the double
# quote was missing from the original list.
SPECIAL_CHARS = ['\\', '/', ':', '?', '*', '<', '>', '|', '"']

# Path passed to webdriver.Chrome as the chromedriver executable.
CHROMEDRIVER_PATH = 'chromedriver.exe'
class QuantaScrape(object):
    """Small GraphQL client for the quantamagazine.org ``/graphql`` endpoint.

    The transport and client are created lazily on the first ``execute``
    call and then reused, so repeated queries do not rebuild the client
    (which is configured with ``fetch_schema_from_transport=True``) for
    every request.
    """

    url = "https://www.quantamagazine.org/graphql"
    headers = {"Content-type": "application/json"}

    def __init__(self):
        # Both are populated on demand by define_transport / define_client.
        self.transport = None
        self.client = None

    def define_transport(self):
        """Create the HTTP transport used to talk to ``self.url``."""
        self.transport = RequestsHTTPTransport(
            use_json=True,
            url=self.url,
            # NOTE(review): verify=False turns off TLS certificate
            # verification. Kept to preserve existing behaviour; switch to
            # True unless there is a known certificate problem.
            verify=False,
            headers=self.headers,
            retries=3,
        )

    def define_client(self):
        """Create the gql client on top of the current transport."""
        self.client = Client(
            transport=self.transport,
            fetch_schema_from_transport=True,
        )

    def execute(self, query, variables):
        """Run ``query`` with ``variables`` and return the client's result.

        Args:
            query: Query object to execute (as produced by ``gql(...)``).
            variables: Mapping passed to the client as ``variable_values``.

        Returns:
            Whatever ``Client.execute`` returns for the query.
        """
        # Build the transport/client only once; later calls reuse them.
        if self.transport is None:
            self.define_transport()
        if self.client is None:
            self.define_client()
        return self.client.execute(query, variable_values=variables)
# GraphQL document used to page through the archive. It takes an integer
# ``$offset`` variable and aliases ``getPostPageArchive`` to ``operationName``,
# so results are read from ``result['operationName']``: ``meta.max_num_pages``
# for the page count and ``data`` for the posts (``title`` and ``link``).
query = gql(
    """
    query ($offset: Int){
      operationName: getPostPageArchive(offset: $offset, type: "archive"){
        meta{
          max_num_pages
        }
        data{
          ...on Post{
            title
            link
          }
        }
      }
    }
    """
)
def print_to_pdf(title, link):
    """Load ``link`` in Chrome and print it via the "Save as PDF" destination.

    Args:
        title: Article title. Not used by this function; accepted so that
            existing callers can keep passing it.
        link: URL of the page to open and print.
    """
    # Local import keeps this function self-contained.
    from selenium.common.exceptions import TimeoutException

    # Pre-select "Save as PDF" as the print destination.
    app_state = {
        "recentDestinations": [{
            "id": "Save as PDF",
            "origin": "local",
            "account": "",
        }],
        "selectedDestinationId": "Save as PDF",
        "version": 2,
    }
    prefs = {
        'printing.print_preview_sticky_settings.appState': json.dumps(app_state),
    }

    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_experimental_option('prefs', prefs)
    chrome_options.add_argument('--kiosk-printing')

    driver = webdriver.Chrome(executable_path=CHROMEDRIVER_PATH, options=chrome_options)
    try:
        driver.get(link)
        # Scroll to the bottom of the page before printing.
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

        # The original built a WebDriverWait but never called .until(), so it
        # did not wait at all. Wait (up to 20 s) for the page's images to
        # finish loading; on timeout, print whatever has loaded.
        try:
            WebDriverWait(driver, 20).until(
                lambda d: d.execute_script(
                    "return Array.from(document.images)"
                    ".every(function (img) { return img.complete; });"
                )
            )
        except TimeoutException:
            print("Timed out waiting for images to load; printing anyway")

        driver.execute_script('window.print();')
    finally:
        # Always shut the browser down, even if loading or printing failed.
        driver.quit()
def download_pdf(filename):
    """Print every article listed in a JSON file to PDF.

    Args:
        filename: Path to a JSON file containing a list of objects, each
            with a ``title`` and a ``link`` key.
    """
    with open(filename, encoding="utf-8") as f:
        articles = json.load(f)

    for article in articles:
        # Spaces become underscores, then every character listed in
        # SPECIAL_CHARS is dropped.
        title = article['title'].replace(' ', '_')
        title = ''.join(ch for ch in title if ch not in SPECIAL_CHARS)
        print(title)
        print_to_pdf(title, article['link'])
def main():
    """Fetch every archive page and write the collected posts to articles.json.

    Pages are requested with offsets ``1 .. max_num_pages``. The response for
    offset 1 supplies both the page count and the first batch of posts, so it
    is not requested a second time.
    """
    scraper = QuantaScrape()

    first_page = scraper.execute(query, {"offset": 1})['operationName']
    max_pages = first_page['meta']['max_num_pages']

    articles = []
    if max_pages >= 1:
        articles.extend(first_page['data'])
    for offset in range(2, max_pages + 1):
        page = scraper.execute(query, {"offset": offset})['operationName']
        articles.extend(page['data'])

    with open('articles.json', 'w') as f:
        json.dump(articles, f, indent=4)
if __name__ == '__main__':
    # Usage:
    #   <script>                  scrape the archive into articles.json
    #   <script> download <file>  print every article listed in <file> to PDF
    #
    # Check the argument count first: indexing sys.argv[1] directly raised
    # IndexError when the script was started without arguments.
    if len(sys.argv) > 1 and sys.argv[1] == 'download':
        if len(sys.argv) < 3:
            sys.exit("usage: {} download <articles.json>".format(sys.argv[0]))
        download_pdf(sys.argv[2])
    else:
        main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment