Created
July 26, 2018 04:45
-
-
Save riptl/d116c8a544937a9b8e0768e530db3a1f to your computer and use it in GitHub Desktop.
download ARM PDFs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3
# by terorie, 2018
# sorry ARM
"""Download the PDFs linked from the ARM developer documentation index.

The script fetches https://developer.arm.com/docs, follows every link whose
href starts with '/docs/', looks on each of those pages for an element with
the class 'download-pdf', and saves the PDF its first anchor points to into
DUMP_DIR. Files that already exist in DUMP_DIR are skipped, so the script can
be re-run to resume.
"""

import os
import shutil

import lxml.html
import requests

DUMP_DIR = 'armpdf'
BASE_URL = 'https://developer.arm.com'

s = requests.Session()

r = s.get(BASE_URL + '/docs')
doc = lxml.html.fromstring(r.text)
links = doc.cssselect('a')
del r

print("Got PDF list:", len(links), "links")

os.makedirs(DUMP_DIR, exist_ok=True)

# The index can link to the same page more than once; fetch each page once.
seen_urls = set()

for link in links:
    url = link.get('href')
    if url is None or not url.startswith('/docs/'):
        continue
    if url in seen_urls:
        continue
    seen_urls.add(url)

    pdf_page_url = BASE_URL + url
    pdf_page_r = s.get(pdf_page_url)
    pdf_page_doc = lxml.html.fromstring(pdf_page_r.text)
    del pdf_page_r

    pdf_page_links = pdf_page_doc.find_class('download-pdf')
    if not pdf_page_links:
        continue

    pdf_page_as = pdf_page_links[0].cssselect('a')
    if not pdf_page_as:
        continue

    pdf_page_a = pdf_page_as[0]
    pdf_page_a.make_links_absolute(pdf_page_url)
    pdf_url = pdf_page_a.get('href')
    if not pdf_url:
        # Anchor without an href: nothing to download.
        continue

    _, _, pdf_name = pdf_url.rpartition('/')
    if not pdf_name:
        # URL ends with '/', so there is no file name to save under.
        continue

    # Check inside DUMP_DIR (where the file is written), not the current
    # working directory, so finished downloads are skipped on a re-run.
    pdf_path = os.path.join(DUMP_DIR, pdf_name)
    if os.path.exists(pdf_path):
        continue

    print(pdf_url)

    # Verify fails on macOS High Sierra :(
    # NOTE: verify=False disables TLS certificate checking for this request.
    pdf_r = s.get(pdf_url, stream=True, verify=False)
    try:
        if not pdf_r.ok:
            # Do not save an HTTP error page as if it were a PDF.
            print("Skipping, HTTP status", pdf_r.status_code)
            continue

        # Have the raw stream undo any gzip/deflate Content-Encoding so the
        # bytes written to disk are the PDF itself.
        pdf_r.raw.decode_content = True

        # Write to a temporary name and rename when complete, so that an
        # interrupted download is not mistaken for a finished file later.
        part_path = pdf_path + '.part'
        with open(part_path, 'wb') as pdf_file:
            shutil.copyfileobj(pdf_r.raw, pdf_file)
        os.replace(part_path, pdf_path)
    finally:
        # Release the streamed connection back to the session pool.
        pdf_r.close()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Host https://static.docs.arm.com gets an SSL verify error in programs installed with Homebrew, but works in native apps.
python, pyenv and wget installed via Homebrew raise SSLError; Safari and curl work.
That's why verify=False in line 53 is set.
Any help would be appreciated.