Created
August 28, 2017 00:02
-
-
Save MainasuK/4700fd0d3e5adfbf8e27198272fdf950 to your computer and use it in GitHub Desktop.
Apple document pages spider
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import requests | |
import sys | |
from bs4 import BeautifulSoup | |
from sys import stdin | |
# usage: | |
# echo 'https://developer.apple.com/library/content/documentation/NetworkingInternetWeb/Conceptual/SafariAppExtension_PG/index.html' | python3 document_urls.py | |
# Use this script to get the links of all pages of *one* Apple programming guide document.
def parse_next_page_url(currentURL):
    """Return the URL of the page that follows ``currentURL``, or ``""``.

    The page is downloaded and searched for a "next page" link in one of
    the two layouts this script knows about:

    * old style documents: ``<a class="nextLink" href="...">``
    * some documents after 2016: ``<p class="next-link">`` wrapping the link

    The link target is resolved relative to ``currentURL`` and its
    ``#fragment`` is dropped, so the result addresses the page itself.
    The resolved URL is also printed to stdout as a progress indicator.

    Args:
        currentURL: Absolute URL of the page to inspect.

    Returns:
        The absolute URL of the next page, or ``""`` when the page has no
        usable next link (no matching tag, no ``href``, or a link that
        only points back at the current page).

    Raises:
        requests.exceptions.RequestException: If the page cannot be
            fetched (including when the request times out).
    """
    # Local import keeps this function self-contained; only the standard
    # library is involved.
    from urllib.parse import urldefrag, urljoin

    # Without a timeout a stalled connection would block the crawl forever.
    response = requests.get(currentURL, timeout=30)
    soup = BeautifulSoup(response.content, 'html.parser')

    # a tag style (old style document)
    # https://developer.apple.com/library/content/documentation/AudioVideo/Conceptual/MediaPlaybackGuide/Contents/Resources/en.lproj/RevisionHistory.html
    link = soup.find('a', {'class': 'nextLink'})

    if link is None:
        # p Tag style (some document after 2016)
        # https://developer.apple.com/library/content/documentation/NetworkingInternetWeb/Conceptual/SafariAppExtension_PG/index.html
        paragraph = soup.find('p', {'class': 'next-link'})
        if paragraph is not None:
            anchors = paragraph.find_all('a')
            # The last anchor inside the paragraph is the one to follow.
            link = anchors[-1] if anchors else None

    href = link.get('href') if link is not None else None
    if not href:
        # Debug
        # print(response.content)
        # print('Parse failed for: ' + currentURL)
        return ""

    # urljoin handles "../Dir/Page.html", "Page.html" and absolute targets
    # alike; urldefrag removes the in-page anchor from the result.
    nextURL = urldefrag(urljoin(currentURL, href)).url

    # A fragment-only link resolves to the current page; following it
    # would never make progress, so report "no next page" instead.
    if nextURL == urldefrag(currentURL).url:
        return ""

    print(nextURL)
    return nextURL
def main(argv=None):
    """Collect the URLs of every page of one document, in reading order.

    Reads the URL of the first page from the first line of stdin, follows
    the "next page" links via ``parse_next_page_url`` until there is no
    further page, and writes all collected URLs (one per line, starting
    with the first page) to ``document_urls.txt`` in the current directory.

    Args:
        argv: Accepted for the conventional ``main(argv)`` signature; the
            script takes its input from stdin and does not read it.

    Returns:
        ``0`` on success, ``1`` when no start URL was supplied on stdin.
    """
    # p tag style:
    # currentURL = 'https://developer.apple.com/library/content/documentation/NetworkingInternetWeb/Conceptual/SafariAppExtension_PG/index.html'
    currentURL = stdin.readline().strip()
    if not currentURL:
        print('No start URL given on stdin', file=sys.stderr)
        return 1

    urls = [currentURL]
    # Remember visited pages so that a "next" link leading back to an
    # earlier page ends the crawl instead of looping forever.
    seen = {currentURL}
    while True:
        currentURL = parse_next_page_url(currentURL)
        if not currentURL or currentURL in seen:
            break
        seen.add(currentURL)
        urls.append(currentURL)

    with open('document_urls.txt', 'w') as out:
        out.writelines("%s\n" % url for url in urls)
    return 0
# Run the crawler only when executed as a script; main()'s return value
# becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Automator script:
Save URLs as PDFs (via Safari):
https://www.dropbox.com/s/mrotz0ymmvwfi5q/Save%20URLs%20as%20PDFs.workflow.zip?dl=0
merge PDF
https://www.dropbox.com/s/1tsu2rt5jo02apk/Merge%20PDF.workflow.zip?dl=0
Before using that script, modify the literal strings to match your language first. :D