Skip to content

Instantly share code, notes, and snippets.

@aajjbb
Created December 30, 2015 01:16
Show Gist options
  • Save aajjbb/89d1cc8d3d5a87137f79 to your computer and use it in GitHub Desktop.
"""
aajjbb
python3 script based on ricbit script to dowload all books from computer science section of Springer
It has BeautifulSoup as dependency, easy installed with 'pip install beautifulsoup4'
"""
import re, subprocess, time, sys
from urllib.request import urlopen
from bs4 import BeautifulSoup
pmax = 456
target_file = open("test.txt", "w")
ROOT_URL = 'http://link.springer.com/search/page/%d?facet-discipline="Computer+Science"&showAll=false&facet-language="En"&facet-content-type="Book'
def getBookInfo(url):
    """Extract title and relative page URL from one search-result link.

    Accepts a bs4 Tag (or anything whose str() is an <a> element) and
    returns a dict with 'book_title' and 'page_url'.  When the anchor
    carries no text, the href doubles as the title.
    """
    anchor = BeautifulSoup(str(url), "html.parser").a
    href = anchor['href']
    title = anchor.string or href
    return {'book_title': title, 'page_url': href}
def getDownloadUrl(url):
    """Return the absolute PDF download URL for a book page, or None.

    Fetches *url*, scans its anchors for the first one whose ``id``
    starts with "toc-download-book-pdf", and returns that anchor's href
    prefixed with the Springer host.  Returns None when no such anchor
    exists (e.g. the book has no downloadable PDF).
    """
    page = BeautifulSoup(urlopen(url), "html.parser")
    # The original also extracted the <h1 id="title"> text into an
    # unused local; that dead code is dropped here.
    for link in page.find_all("a"):
        # Not every anchor has an id attribute; .get() returns None
        # instead of raising, replacing the original bare `except: pass`.
        tag_id = link.get("id")
        if tag_id and str(tag_id).startswith("toc-download-book-pdf"):
            return "http://link.springer.com" + str(link["href"])
    return None
# Crawl every search-result page, resolve each book's PDF link, and
# download it with wget (saved under the title with spaces dashed out).
for i in range(1, pmax):
    index = ROOT_URL % i
    page = urlopen(index)
    soup = BeautifulSoup(page, 'html.parser')
    urls = soup.find_all("a", class_="title")
    for url in urls:
        book_info = getBookInfo(url)
        download_url = getDownloadUrl('http://link.springer.com' + book_info['page_url'])
        if download_url:
            # Pass argv as a list: the original formatted one string and
            # .split() it, which breaks if the URL or the dash-joined
            # title contains whitespace-sensitive characters.
            filename = str(book_info['book_title']).replace(' ', '-')
            ret = subprocess.Popen(["wget", download_url, "-O", filename])
            ret.wait()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment