Created
December 30, 2015 01:16
-
-
Save aajjbb/89d1cc8d3d5a87137f79 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""
aajjbb
Python 3 script, based on ricbit's script, to download all books from the Computer Science section of Springer.
It depends on BeautifulSoup, which is easily installed with 'pip install beautifulsoup4'.
"""
import re
import subprocess
import sys
import time
from urllib.parse import urljoin
from urllib.request import urlopen

from bs4 import BeautifulSoup
# Upper bound used by the crawl loop over search-result pages.
# NOTE(review): the loop iterates range(1, pmax), so page number `pmax`
# itself is never requested -- confirm whether the bound is meant to be
# inclusive.
pmax = 456
# NOTE(review): opened (and truncated) when the module loads, but never
# written to or closed in the visible code. Kept so that any existing
# reference to the name keeps working.
target_file = open("test.txt", "w")
# Search-result URL template; the single %d placeholder is the page number.
# The closing quote after Book was missing, leaving the content-type facet
# unterminated and inconsistent with the other two quoted facets.
ROOT_URL = 'http://link.springer.com/search/page/%d?facet-discipline="Computer+Science"&showAll=false&facet-language="En"&facet-content-type="Book"'
def getBookInfo(url):
    """Extract a book's title and detail-page path from a search-result link.

    Args:
        url: The ``<a>`` element of one search result. Either an
            already-parsed tag (an object whose ``name`` is ``"a"``) or any
            object whose ``str()`` is HTML containing an ``<a>`` element.

    Returns:
        A dict with two keys: ``'page_url'`` is the anchor's ``href`` and
        ``'book_title'`` is the anchor's text, falling back to the ``href``
        when the anchor has no single text string.

    Raises:
        ValueError: If no ``<a>`` element can be found in ``url``.
        KeyError: If the anchor has no ``href`` attribute.
    """
    if getattr(url, "name", None) == "a":
        # Already a parsed anchor: use it directly instead of serialising
        # it to text and parsing that text again for every result.
        info = url
    else:
        info = BeautifulSoup(str(url), "html.parser").a
    if info is None:
        raise ValueError("no <a> element found in %r" % (url,))
    page_url = info['href']
    book_title = info.string
    return {'book_title': book_title if book_title else page_url, 'page_url': page_url}
def getDownloadUrl(url):
    """Return the absolute URL of the whole-book PDF linked from a book page.

    Args:
        url: Absolute URL of a book's detail page.

    Returns:
        The PDF download URL as a string, or ``None`` when the page has no
        link whose ``id`` starts with ``toc-download-book-pdf`` and which
        carries an ``href``.

    Raises:
        urllib.error.URLError: If the page cannot be fetched.
    """
    # Close the HTTP response as soon as the document has been parsed.
    with urlopen(url) as response:
        page = BeautifulSoup(response, "html.parser")
    for link in page.find_all("a"):
        tag_id = link.get("id")
        href = link.get("href")
        # Links without an id or without an href are skipped explicitly,
        # instead of relying on a bare ``except`` to hide the KeyError.
        if tag_id and href and str(tag_id).startswith("toc-download-book-pdf"):
            # urljoin yields the same result as plain concatenation for
            # root-relative hrefs and also copes with absolute ones.
            return urljoin("http://link.springer.com", str(href))
    return None
def _output_filename(title):
    """Turn a book title into a single-component file name for ``wget -O``."""
    # Spaces become dashes, as before. Path separators are replaced too:
    # the title can fall back to the page URL (a path containing "/"), and
    # a "/" in the name would make wget write into a non-existent directory.
    return str(title).replace(' ', '-').replace('/', '-')


def main():
    """Walk the search-result pages and download every book offering a PDF.

    Each result link is resolved to its book page, the book page to its PDF
    download URL, and the PDF is fetched with the external ``wget`` program
    into the current directory. Books without a PDF link are skipped.
    """
    # NOTE(review): range(1, pmax) stops at page pmax - 1; left unchanged
    # because the intended bound cannot be confirmed from this file.
    for i in range(1, pmax):
        # Close the HTTP response as soon as the page has been parsed.
        with urlopen(ROOT_URL % i) as page:
            soup = BeautifulSoup(page, 'html.parser')
        for url in soup.find_all("a", class_="title"):
            book_info = getBookInfo(url)
            download_url = getDownloadUrl('http://link.springer.com' + book_info['page_url'])
            if not download_url:
                continue
            # Pass argv as a list: building one string and splitting it on
            # whitespace broke whenever the title contained tabs/newlines.
            # The exit status is ignored on purpose so that one failed
            # download does not abort the whole crawl.
            subprocess.call(["wget", download_url, "-O", _output_filename(book_info['book_title'])])


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment